├── .gitignore ├── LICENSE ├── README.md ├── examples ├── README.md ├── netflow │ ├── config_example_netflow_nodp.json │ └── driver.py └── pcap │ ├── config_example_pcap_nodp.json │ └── driver.py ├── netshare ├── __init__.py ├── configs │ └── default │ │ ├── __init__.py │ │ ├── dg_table_row_per_sample.json │ │ └── single_event_per_row.json ├── generators │ ├── __init__.py │ └── generator.py ├── model_managers │ ├── __init__.py │ ├── dg_model_manager.py │ ├── model_manager.py │ └── netshare_manager │ │ ├── generate_helper.py │ │ ├── netshare_manager.py │ │ ├── netshare_util.py │ │ └── train_helper.py ├── models │ ├── __init__.py │ ├── doppelganger_torch │ │ ├── __init__.py │ │ ├── doppelganger.py │ │ ├── load_data.py │ │ ├── network.py │ │ ├── privacy_util.py │ │ └── util.py │ ├── doppelganger_torch_model.py │ └── model.py ├── pre_post_processors │ ├── __init__.py │ ├── dg_row_per_sample_pre_post_processor.py │ ├── netshare │ │ ├── README.md │ │ ├── choose_best_model.py │ │ ├── denormalize_fields.py │ │ ├── dist_metrics.py │ │ ├── embedding_helper.py │ │ ├── main.c │ │ ├── netshare_pre_post_processor.py │ │ ├── packet.h │ │ ├── preprocess_helper.py │ │ ├── sharedlib.sh │ │ ├── util.py │ │ └── word2vec_embedding.py │ └── pre_post_processor.py ├── ray │ ├── __init__.py │ ├── config.py │ ├── ray_functions.py │ └── remote.py └── utils │ ├── __init__.py │ ├── exec_cmd.py │ ├── field.py │ ├── logger.py │ ├── output.py │ └── tee.py ├── setup.py ├── traces ├── README.md ├── caida-small │ └── raw.pcap └── ugr16-small │ └── raw.csv └── util ├── README.md ├── grow-rootfs.sh ├── ray ├── check_nodes.py └── example.yaml ├── setup-cpu.sh └── setup_node_parallel.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | .DS_Store 4 | .vscode/ 5 | placeholder 6 | data 7 | debug 8 | 9 | netshare/dashboard/static/tmp 10 | 11 | tests/ 12 | traces/ 13 | results/ 14 | rsync*.sh 15 | *.pkl 16 | *.ini 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | *.o 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | pip-wheel-metadata/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | db.sqlite3-journal 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Clear BSD License 2 | 3 | Copyright (c) 2022 Carnegie Mellon University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the disclaimer below) provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | * Neither the name of Carnegie Mellon University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Practical GAN-based Synthetic IP Header Trace Generation using NetShare
2 |
3 | [[paper (SIGCOMM 2022)](https://dl.acm.org/doi/abs/10.1145/3544216.3544251)]
4 | [[talk (SIGCOMM 2022)](https://www.youtube.com/watch?v=mWnFIncjtWg)]
5 | [[talk (ZeekWeek 2022)](https://www.youtube.com/watch?v=MN_fa-FBOHg)]
6 | [[talk (FloCon 2023)](https://resources.sei.cmu.edu/library/asset-view.cfm?assetid=890917)]
7 | [[web service demo](https://drive.google.com/file/d/1vPuneEb14A2w7fKyCJ41NAHzsvpLQP5H/view)]
8 |
9 | **Authors:**
10 | [[Yucheng Yin](https://sniperyyc.com/)]
11 | [[Zinan Lin](http://www.andrew.cmu.edu/user/zinanl/)]
12 | [[Minhao Jin](https://www.linkedin.com/in/minhao-jin-1328b8164/)]
13 | [[Giulia Fanti](https://www.andrew.cmu.edu/user/gfanti/)]
14 | [[Vyas Sekar](https://users.ece.cmu.edu/~vsekar/)]
15 |
16 | **Abstract:** We explore the feasibility of using Generative Adversarial Networks (GANs) to automatically learn generative models to generate synthetic packet- and flow header traces for networking tasks (e.g., telemetry, anomaly detection, provisioning). We identify key fidelity, scalability, and privacy challenges and tradeoffs in existing GAN-based approaches. By synthesizing domain-specific insights with recent advances in machine learning and privacy, we identify design choices to tackle these challenges. Building on these insights, we develop an end-to-end framework, NetShare. We evaluate NetShare on six diverse packet header traces and find that: (1) across distributional metrics and traces, it achieves 46% more accuracy than baselines, and (2) it meets users’ requirements of downstream tasks in evaluating accuracy and rank ordering of candidate approaches.
17 |
18 | # News
19 | [2023.04] Woohoo! New version released with a list of new features:
20 | - Bump Python version to 3.9
21 | - Replace tensorflow 1.15 with torch
22 | - Support generic dataset formats
23 | - Add [SDMetrics](https://github.com/netsharecmu/SDMetrics_timeseries/tree/master/sdmetrics) for hyperparameter/model selection and data visualization
24 |
25 | [2022.08] The deprecated [`camera-ready`](https://github.com/netsharecmu/NetShare/releases/tag/camera-ready-deprecated) branch holds the scripts we used to run all the experiments in the [paper](https://dl.acm.org/doi/abs/10.1145/3544216.3544251).
26 |
27 | # Users
28 | NetShare has been used by several independent users/companies.
29 |
30 | - [Purdue CS536 Fall 2022 Class project](https://github.com/annuszulfiqar2021/NetShare)
31 | - [Rockfish Data](https://rockfish.ai/index.html)
32 |
33 | # Datasets
34 | ***We are adding more datasets! Feel free to add your own and contribute!***
35 |
36 | Our paper uses **six** public datasets for reproducibility.
Please download the six datasets [here](https://drive.google.com/drive/folders/1FOl1VMr0tXhzKEOupxnJE9YQ2GwfX2FD?usp=sharing) and put them under `traces/`.
37 |
38 | You may also refer to the [README](traces/README.md) for detailed descriptions of the datasets.
39 |
40 |
41 | # Setup
42 | ## Step 0: Install `libpcap` dependency (Optional)
43 | If you are working with PCAP files and have not installed `libpcap`:
44 | - On macOS, install using `homebrew`:
45 | ```Bash
46 | brew install libpcap
47 | ```
48 | - On Debian-based systems (e.g., Ubuntu), install using `apt`:
49 | ```Bash
50 | sudo apt install libpcap-dev
51 | ```
52 |
53 | ## Step 1: Install NetShare Python package (Required)
54 | We recommend installing NetShare in a virtual environment (e.g., Anaconda3). We test with a Python 3.9 virtual environment.
55 |
56 | ```Bash
57 | # Assume Anaconda is installed
58 | # Create the virtual environment if it does not exist
59 | conda create --name NetShare python=3.9
60 |
61 | # Activate virtual env
62 | conda activate NetShare
63 |
64 | # Install NetShare package
65 | git clone https://github.com/netsharecmu/NetShare.git
66 | pip3 install -e NetShare/
67 |
68 | # Install SDMetrics package
69 | git clone https://github.com/netsharecmu/SDMetrics_timeseries
70 | pip3 install -e SDMetrics_timeseries/
71 | ```
72 |
73 | ## Step 2: How to start Ray? (Optional but **strongly** recommended)
74 | Ray is a unified framework for scaling AI and Python applications. Our framework utilizes Ray to increase parallelism and distribute workloads across the cluster automatically and efficiently.
75 |
76 | ### Laptop/single-machine (only recommended for demo/dev/fun)
77 | ```
78 | ray start --head --port=6379 --include-dashboard=True --dashboard-host=0.0.0.0 --dashboard-port=8265
79 | ```
80 |
81 | Please go to [http://localhost:8265](http://localhost:8265) to view the Ray dashboard.
82 |
83 |
84 | ### Multi-machine (**strongly** recommended for faster training/generation)
85 | We provide a utility script and [README](util/README.md) under `util/` for setting up a Ray cluster. As a reference, we use [Cloudlab](https://www.cloudlab.us/), which is referred to as a "custom cluster" in the Ray documentation. If you are using a different cluster (e.g., AWS, GCP, Azure), please refer to the [Ray doc](https://docs.ray.io/en/releases-2.0.0rc0/cluster/cloud.html#cluster-cloud) for details.
86 |
87 |
88 |
89 | # Example usage
90 | ***We are adding more examples of usage (PCAP, NetFlow, w/ and w/o DP). Please stay tuned!***
91 |
92 | Here is a minimal working example to generate synthetic NetFlow data without differential privacy. Please change your working directory to `examples/` by `cd examples/`.
93 |
94 | You may refer to [`examples`](examples/) for more scripts and config files.
95 |
96 | [Driver code](examples/netflow/driver.py)
97 | ```Python
98 | import random
99 | import netshare.ray as ray
100 | from netshare import Generator
101 |
102 | if __name__ == '__main__':
103 |     # Change to False if you would not like to use Ray
104 |     ray.config.enabled = False
105 |     ray.init(address="auto")
106 |
107 |     # configuration file
108 |     generator = Generator(config="config_example_netflow_nodp.json")
109 |
110 |     # `work_folder` should not exist o/w an overwrite error will be thrown.
111 |     # Please set the `work_folder` as *absolute path*
112 |     # if you are using Ray with multi-machine setup
113 |     # since Ray has bugs when dealing with relative paths.
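    # (Added note, not part of the original example: one simple way to
    # guarantee an absolute path is to resolve it explicitly, e.g.
    #     work_folder = os.path.abspath('../../results/test-ugr16')
    # which additionally requires `import os` at the top of this script.)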
114 |     generator.train(work_folder=f'../../results/test-ugr16')
115 |     generator.generate(work_folder=f'../../results/test-ugr16')
116 |     generator.visualize(work_folder=f'../../results/test-ugr16')
117 |
118 |     ray.shutdown()
119 | ```
120 |
121 | The corresponding configuration file is [`config_example_netflow_nodp.json`](examples/netflow/config_example_netflow_nodp.json).
122 | You may refer to the [README](netshare/configs/README.md) for detailed explanations of the configuration files.
123 |
124 | After generation, you will be redirected to a dashboard where a side-by-side visual comparison between real and synthetic data will be shown.
125 |
126 | # Codebase structure (for *dev* purposes)
127 | ```
128 | ├── doc                    # (tentative) NetShare tutorials and APIs
129 | ├── examples               # Examples of using NetShare on different datasets
130 | ├── netshare               # NetShare source code
131 | │   ├── configs            # Default configurations
132 | │   ├── generators         # Generator class
133 | │   ├── model_managers     # Core of NetShare service (i.e., train/generate)
134 | │   ├── models             # Timeseries GAN models (e.g., DoppelGANger)
135 | │   ├── pre_post_processors # Pre- and post-process data
136 | │   ├── ray                # Ray functions overloading
137 | │   └── utils              # Utility functions/common class definitions
138 | ├── traces                 # Traces/datasets
139 | └── util                   # MISC/setup scripts
140 |     └── ray                # Ray setup script
141 | ```
142 |
143 |
144 | # References
145 | Please cite our paper/codebase appropriately if you find NetShare useful.
146 |
147 | ```bibtex
148 | @inproceedings{netshare-sigcomm2022,
149 | author = {Yin, Yucheng and Lin, Zinan and Jin, Minhao and Fanti, Giulia and Sekar, Vyas},
150 | title = {Practical GAN-Based Synthetic IP Header Trace Generation Using NetShare},
151 | year = {2022},
152 | isbn = {9781450394208},
153 | publisher = {Association for Computing Machinery},
154 | address = {New York, NY, USA},
155 | url = {https://doi.org/10.1145/3544216.3544251},
156 | doi = {10.1145/3544216.3544251},
157 | abstract = {We explore the feasibility of using Generative Adversarial Networks (GANs) to automatically learn generative models to generate synthetic packet- and flow header traces for networking tasks (e.g., telemetry, anomaly detection, provisioning). We identify key fidelity, scalability, and privacy challenges and tradeoffs in existing GAN-based approaches. By synthesizing domain-specific insights with recent advances in machine learning and privacy, we identify design choices to tackle these challenges. Building on these insights, we develop an end-to-end framework, NetShare.
We evaluate NetShare on six diverse packet header traces and find that: (1) across all distributional metrics and traces, it achieves 46% more accuracy than baselines and (2) it meets users' requirements of downstream tasks in evaluating accuracy and rank ordering of candidate approaches.},
158 | booktitle = {Proceedings of the ACM SIGCOMM 2022 Conference},
159 | pages = {458–472},
160 | numpages = {15},
161 | keywords = {privacy, synthetic data generation, network packets, network flows, generative adversarial networks},
162 | location = {Amsterdam, Netherlands},
163 | series = {SIGCOMM '22}
164 | }
165 | ```
166 |
167 | Part of the source code is adapted from the following open-source projects:
168 |
169 | - [DoppelGANger](https://github.com/fjxmlzn/DoppelGANger)
170 | - [GPUTaskScheduler](https://github.com/fjxmlzn/GPUTaskScheduler)
171 | - [BSN](https://github.com/fjxmlzn/BSN)
172 | - [Ray](https://github.com/ray-project/ray)
173 | - [config_io](https://github.com/fjxmlzn/config_io)
174 | - [SDMetrics](https://github.com/sdv-dev/SDMetrics)
175 |
-------------------------------------------------------------------------------- /examples/README.md: --------------------------------------------------------------------------------
1 | We support multiple common data schemas; here are a few examples with the corresponding configuration files. You may start from the "nearest match" to your own data.
2 |
3 | **Note: across all examples, `iteration` is set to a small number to ensure a quick E2E test. For generating high-quality synthetic data, we recommend increasing `iteration` based on your experience and computational resources.**
4 |
5 | # Prerequisite
6 | We support four different field types:
7 | 1. Bit field (encoded as bit strings), e.g.,
8 | ```JSON
9 | {
10 |     "column": "srcip",
11 |     "type": "integer",
12 |     "encoding": "bit",
13 |     "n_bits": 32
14 | }
15 | ```
16 | An optional property of this field is `truncate`, a boolean that defaults to `false`. If `truncate` is set to `true`, large integers are truncated so that only the most significant `n_bits` bits are kept.
17 |
18 | 2. Word2Vec field (encoded as Word2Vec vectors), e.g.,
19 | ```JSON
20 | {
21 |     "column": "srcport",
22 |     "type": "integer",
23 |     "encoding": "word2vec_port"
24 | }
25 | ```
26 | 3. Categorical field (one-hot encoded), e.g.,
27 | ```JSON
28 | {
29 |     "column": "type",
30 |     "type": "string",
31 |     "encoding": "categorical"
32 | }
33 | ```
34 | 4. Continuous field, e.g.,
35 | ```JSON
36 | {
37 |     "column": "pkt",
38 |     "type": "float",
39 |     "normalization": "ZERO_ONE",
40 |     "log1p_norm": true
41 | }
42 | ```
43 |
44 | # Dataset type 1: single-event
45 | The single-event schema contains one event (a single timeseries record) per row.
46 |
47 | ## Data schema
48 | | Timestamp (optional) | Metadata 1 | Metadata 2 | ... | Timeseries 1 | Timeseries 2 | ... |
49 | |:--------------------:|:----------:|:----------:|:---:|:-------------:|:-------------:|:---:|
50 | | t1 | | | | | | |
51 | | t2 | | | | | | |
52 | | ... | | | | | | |
53 |
54 | ## Examples
55 | 1. PCAP
56 | | Timestamp | Srcip | Dstip | Srcport | Dstport | Proto | Pkt_size | ... |
57 | |:---------:|:-----:|:-----:|:-------:|:-------:|:-----:|:--------:|:---:|
58 | | t1 | | | | | | | |
59 | | t2 | | | | | | | |
60 | | ... | | | | | | | |
61 |
62 | 2. NetFlow ([configuration_file](netflow/config_example_netflow_nodp.json))
63 |
64 |
70 |
71 |
72 |
73 | # [Dataset type 2: multi-event](./dg_table_row_per_sample/README.md)
74 | The multi-event data schema packs a whole series (multiple timestamped events) into each row, as illustrated by the sketch below.
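For intuition, here is a minimal sketch of the difference between the two schemas. It is purely illustrative: the column names and values are invented for this sketch and are not part of NetShare's API or of any shipped example.

```Python
import pandas as pd

# Single-event (dataset type 1): one event per row; rows that share the same
# metadata (e.g., the 5-tuple) together form one timeseries.
single_event = pd.DataFrame({
    "ts":    [1, 2, 3],            # timestamp (optional)
    "srcip": ["10.0.0.1"] * 3,     # metadata
    "pkt":   [12, 7, 30],          # timeseries measurement
})

# Multi-event (dataset type 2): one whole series per row; metadata columns
# come first, followed by one column per {timestamp, measurement} group
# (cf. the Wikipedia page-view example below).
multi_event = pd.DataFrame({
    "domain":      ["en.wikipedia.org"],  # metadata
    "access_type": ["desktop"],           # metadata
    "day_1_views": [120],                 # event 1
    "day_2_views": [98],                  # event 2
})
```

The schema tables below make the same distinction more formally.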
75 | 76 | ## Data Schema 77 | | Metadata 1 | Metadata 2 | ... | {Timestamp (optional), Timeseries 1, Timeseries 2, ...} | {Timestamp (optional), Timeseries 1, Timeseries 2, ...} | ... | 78 | |:----------:|:----------:|:---:|:-------------------------------------------------------:|:-------------------------------------------------------:|:---:| 79 | | | | | | | | 80 | | | | | | | | 81 | 82 | ## Examples 83 | 1. Wikipedia dataset ([configuration_file](./dg_table_row_per_sample/config_example_wiki.json)) 84 | | Domain | Access type | Agent | {Date 1, page view} | {Date 2, page view} | ... | 85 | |:------:|:-----------:|:-----:|:-------------------:|:-------------------:|:---:| 86 | | | | | | | | 87 | | | | | | | | 88 | 89 | -------------------------------------------------------------------------------- /examples/netflow/config_example_netflow_nodp.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "original_data_file": "../../traces/ugr16-small/raw.csv", 4 | "overwrite": true, 5 | "dataset_type": "netflow", 6 | "n_chunks": 2, 7 | "dp": false 8 | }, 9 | "default": "single_event_per_row.json", 10 | "pre_post_processor": { 11 | "class": "NetsharePrePostProcessor", 12 | "config": { 13 | "timestamp": { 14 | "column": "ts", 15 | "generation": true, 16 | "encoding": "interarrival", 17 | "normalization": "ZERO_ONE" 18 | }, 19 | "word2vec": { 20 | "vec_size": 10, 21 | "model_name": "word2vec_vecSize", 22 | "annoy_n_trees": 100, 23 | "pretrain_model_path": null 24 | }, 25 | "metadata": [ 26 | { 27 | "column": "srcip", 28 | "type": "integer", 29 | "encoding": "bit", 30 | "n_bits": 32, 31 | "categorical_mapping": false 32 | }, 33 | { 34 | "column": "dstip", 35 | "type": "integer", 36 | "encoding": "bit", 37 | "n_bits": 32, 38 | "categorical_mapping": false 39 | }, 40 | { 41 | "column": "srcport", 42 | "type": "integer", 43 | "encoding": "word2vec_port" 44 | }, 45 | { 46 | "column": "dstport", 47 | "type": "integer", 48 | "encoding": "word2vec_port" 49 | }, 50 | { 51 | "column": "proto", 52 | "type": "string", 53 | "encoding": "word2vec_proto" 54 | } 55 | ], 56 | "timeseries": [ 57 | { 58 | "column": "td", 59 | "type": "float", 60 | "normalization": "ZERO_ONE", 61 | "log1p_norm": true 62 | }, 63 | { 64 | "column": "pkt", 65 | "type": "float", 66 | "normalization": "ZERO_ONE", 67 | "log1p_norm": true 68 | }, 69 | { 70 | "column": "byt", 71 | "type": "float", 72 | "normalization": "ZERO_ONE", 73 | "log1p_norm": true 74 | }, 75 | { 76 | "column": "type", 77 | "type": "string", 78 | "encoding": "categorical" 79 | } 80 | ] 81 | } 82 | }, 83 | "model": { 84 | "class": "DoppelGANgerTorchModel", 85 | "config": { 86 | "batch_size": 100, 87 | "sample_len": [ 88 | 1, 89 | 5, 90 | 10 91 | ], 92 | "sample_len_expand": true, 93 | "epochs": 40, 94 | "extra_checkpoint_freq": 1, 95 | "epoch_checkpoint_freq": 5 96 | } 97 | } 98 | } -------------------------------------------------------------------------------- /examples/netflow/driver.py: -------------------------------------------------------------------------------- 1 | import random 2 | import netshare.ray as ray 3 | from netshare import Generator 4 | 5 | if __name__ == '__main__': 6 | # Change to False if you would not like to use Ray 7 | ray.config.enabled = False 8 | ray.init(address="auto") 9 | 10 | # configuration file 11 | generator = Generator(config="config_example_netflow_nodp.json") 12 | 13 | # `work_folder` should not exist o/w an overwrite error will be thrown. 
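    # (Added note: this example's config sets `"overwrite": true` in
    # `global_config`; in that case an existing work_folder is reused with a
    # warning instead of raising an error.)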
14 | # Please set the `worker_folder` as *absolute path* 15 | # if you are using Ray with multi-machine setup 16 | # since Ray has bugs when dealing with relative paths. 17 | generator.train(work_folder=f'../../results/test-ugr16') 18 | generator.generate(work_folder=f'../../results/test-ugr16') 19 | generator.visualize(work_folder=f'../../results/test-ugr16') 20 | 21 | ray.shutdown() 22 | -------------------------------------------------------------------------------- /examples/pcap/config_example_pcap_nodp.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "original_data_file": "../../traces/caida-small/raw.pcap", 4 | "overwrite": true, 5 | "dataset_type": "pcap", 6 | "n_chunks": 1, 7 | "dp": false 8 | }, 9 | "default": "single_event_per_row.json", 10 | "pre_post_processor": { 11 | "class": "NetsharePrePostProcessor", 12 | "config": { 13 | "timestamp": { 14 | "column": "time", 15 | "generation": true, 16 | "encoding": "interarrival", 17 | "normalization": "ZERO_ONE" 18 | }, 19 | "word2vec": { 20 | "vec_size": 10, 21 | "model_name": "word2vec_vecSize", 22 | "annoy_n_trees": 100, 23 | "pretrain_model_path": null 24 | }, 25 | "metadata": [ 26 | { 27 | "column": "srcip", 28 | "type": "integer", 29 | "encoding": "bit", 30 | "n_bits": 32, 31 | "categorical_mapping": false 32 | }, 33 | { 34 | "column": "dstip", 35 | "type": "integer", 36 | "encoding": "bit", 37 | "n_bits": 32, 38 | "categorical_mapping": false 39 | }, 40 | { 41 | "column": "srcport", 42 | "type": "integer", 43 | "encoding": "word2vec_port" 44 | }, 45 | { 46 | "column": "dstport", 47 | "type": "integer", 48 | "encoding": "word2vec_port" 49 | }, 50 | { 51 | "column": "proto", 52 | "type": "string", 53 | "encoding": "word2vec_proto" 54 | } 55 | ], 56 | "timeseries": [ 57 | { 58 | "column": "pkt_len", 59 | "type": "float", 60 | "normalization": "ZERO_ONE" 61 | }, 62 | { 63 | "column": "tos", 64 | "type": "float", 65 | "normalization": "ZERO_ONE", 66 | "min_x": 0.0, 67 | "max_x": 255.0 68 | }, 69 | { 70 | "column": "id", 71 | "type": "float", 72 | "normalization": "ZERO_ONE", 73 | "min_x": 0.0, 74 | "max_x": 65535.0 75 | }, 76 | { 77 | "column": "flag", 78 | "type": "integer", 79 | "encoding": "categorical", 80 | "choices": [ 81 | 0, 82 | 1, 83 | 2 84 | ] 85 | }, 86 | { 87 | "column": "off", 88 | "type": "float", 89 | "normalization": "ZERO_ONE", 90 | "min_x": 0.0, 91 | "max_x": 8191.0 92 | }, 93 | { 94 | "column": "ttl", 95 | "type": "float", 96 | "normalization": "ZERO_ONE", 97 | "min_x": 0.0, 98 | "max_x": 255.0 99 | } 100 | ] 101 | } 102 | }, 103 | "model": { 104 | "class": "DoppelGANgerTorchModel", 105 | "config": { 106 | "batch_size": 100, 107 | "sample_len": [ 108 | 10 109 | ], 110 | "sample_len_expand": true, 111 | "epochs": 40, 112 | "extra_checkpoint_freq": 1, 113 | "epoch_checkpoint_freq": 5 114 | } 115 | } 116 | } -------------------------------------------------------------------------------- /examples/pcap/driver.py: -------------------------------------------------------------------------------- 1 | import random 2 | import netshare.ray as ray 3 | from netshare import Generator 4 | 5 | if __name__ == '__main__': 6 | # Change to False if you would not like to use Ray 7 | ray.config.enabled = False 8 | ray.init(address="auto") 9 | 10 | # configuration file 11 | generator = Generator(config="config_example_pcap_nodp.json") 12 | 13 | # `work_folder` should not exist o/w an overwrite error will be thrown. 
14 | # Please set the `worker_folder` as *absolute path* 15 | # if you are using Ray with multi-machine setup 16 | # since Ray has bugs when dealing with relative paths. 17 | generator.train(work_folder='../../results/test-caida') 18 | generator.generate(work_folder='../../results/test-caida') 19 | generator.visualize(work_folder='../../results/test-caida') 20 | 21 | ray.shutdown() 22 | -------------------------------------------------------------------------------- /netshare/__init__.py: -------------------------------------------------------------------------------- 1 | from .generators.generator import Generator 2 | 3 | __all__ = ['Generator'] 4 | -------------------------------------------------------------------------------- /netshare/configs/default/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/netshare/configs/default/__init__.py -------------------------------------------------------------------------------- /netshare/configs/default/dg_table_row_per_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "overwrite": false, 4 | "original_data_file": "" 5 | }, 6 | "pre_post_processor": { 7 | "class": "DGRowPerSamplePrePostProcessor", 8 | "config": { 9 | "num_train_samples": 50000, 10 | "num_test_samples": 50000, 11 | "metadata": [ 12 | ], 13 | "timeseries": [ 14 | ] 15 | } 16 | }, 17 | "model_manager": { 18 | "class": "DGModelManager", 19 | "config": { 20 | } 21 | }, 22 | "model": { 23 | "class": "DoppelGANgerTFModel", 24 | "config": { 25 | "batch_size": 100, 26 | "sample_len": 10, 27 | "iteration": 200000, 28 | "vis_freq": 100000, 29 | "vis_num_sample": 5, 30 | "d_rounds": 1, 31 | "g_rounds": 1, 32 | "num_packing": 1, 33 | "noise": true, 34 | "attr_noise_type": "normal", 35 | "feature_noise_type": "normal", 36 | "rnn_mlp_num_layers": 0, 37 | "feed_back": false, 38 | "g_lr": 0.001, 39 | "d_lr": 0.001, 40 | "d_gp_coe": 10.0, 41 | "gen_feature_num_layers": 1, 42 | "gen_feature_num_units": 100, 43 | "gen_attribute_num_layers": 3, 44 | "gen_attribute_num_units": 100, 45 | "disc_num_layers": 5, 46 | "disc_num_units": 200, 47 | "initial_state": "random", 48 | "leaky_relu": false, 49 | "attr_d_lr": 0.001, 50 | "attr_d_gp_coe": 10.0, 51 | "g_attr_d_coe": 1.0, 52 | "attr_disc_num_layers": 5, 53 | "attr_disc_num_units": 200, 54 | "aux_disc": true, 55 | "self_norm": true, 56 | "fix_feature_network": false, 57 | "debug": false, 58 | "combined_disc": true, 59 | "use_gt_lengths": false, 60 | "use_uniform_lengths": false, 61 | "num_cores": null, 62 | "sn_mode": null, 63 | "scale": 1.0, 64 | "extra_checkpoint_freq": 20000, 65 | "epoch_checkpoint_freq": 1000, 66 | "dp_noise_multiplier": null, 67 | "dp_l2_norm_clip": null 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /netshare/configs/default/single_event_per_row.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "overwrite": false, 4 | "original_data_file": "traces/1M/ugr16/raw.csv", 5 | "dataset_type": "netflow", 6 | "n_chunks": 10, 7 | "dp": false, 8 | "allowed_data_types": [ 9 | "ip_string", 10 | "integer", 11 | "float", 12 | "string" 13 | ], 14 | "allowed_data_encodings": [ 15 | "categorical", 16 | "bit", 17 | "word2vec_port", 18 | "word2vec_proto" 19 | ] 20 | }, 21 | "pre_post_processor": { 22 | "class": 
"NetsharePrePostProcessor", 23 | "config": { 24 | "max_flow_len": null, 25 | "norm_option": 0, 26 | "split_name": "multichunk_dep_v2", 27 | "df2chunks": "fixed_time", 28 | "truncate": "per_chunk" 29 | } 30 | }, 31 | "model_manager": { 32 | "class": "NetShareManager", 33 | "config": { 34 | "pretrain_dir": null, 35 | "skip_chunk0_train": false, 36 | "pretrain_non_dp": true, 37 | "pretrain_non_dp_reduce_time": 4.0, 38 | "pretrain_dp": false, 39 | "run": 0 40 | } 41 | }, 42 | "model": { 43 | "class": "DoppelGANgerTorchModel", 44 | "config": { 45 | "batch_size": 100, 46 | "sample_len": [ 47 | 1, 48 | 5, 49 | 10 50 | ], 51 | "sample_len_expand": true, 52 | "iteration": 200000, 53 | "vis_freq": 100000, 54 | "vis_num_sample": 5, 55 | "d_rounds": 5, 56 | "g_rounds": 1, 57 | "num_packing": 1, 58 | "noise": true, 59 | "attr_noise_type": "normal", 60 | "feature_noise_type": "normal", 61 | "rnn_mlp_num_layers": 0, 62 | "feed_back": false, 63 | "g_lr": 0.0001, 64 | "g_beta1": 0.5, 65 | "d_lr": 0.0001, 66 | "d_beta1": 0.5, 67 | "d_gp_coe": 10.0, 68 | "adam_eps": 1e-8, 69 | "adam_amsgrad": false, 70 | "generator_feature_num_layers": 1, 71 | "generator_feature_num_units": 100, 72 | "generator_attribute_num_layers": 5, 73 | "generator_attribute_num_units": 512, 74 | "discriminator_num_layers": 5, 75 | "discriminator_num_units": 512, 76 | "initial_state": "random", 77 | "leaky_relu": false, 78 | "attr_d_lr": 0.0001, 79 | "attr_d_beta1": 0.5, 80 | "attr_d_gp_coe": 10.0, 81 | "g_attr_d_coe": 1.0, 82 | "attr_discriminator_num_layers": 5, 83 | "attr_discriminator_num_units": 512, 84 | "use_attr_discriminator": true, 85 | "self_norm": false, 86 | "fix_feature_network": false, 87 | "debug": false, 88 | "combined_disc": true, 89 | "use_gt_lengths": false, 90 | "use_uniform_lengths": false, 91 | "num_cores": null, 92 | "sn_mode": null, 93 | "scale": 1.0, 94 | "extra_checkpoint_freq": 20000, 95 | "epoch_checkpoint_freq": 1000, 96 | "dp_noise_multiplier": null, 97 | "dp_l2_norm_clip": null, 98 | "use_adaptive_rolling": false, 99 | "attribute_latent_dim": 5, 100 | "feature_latent_dim": 5 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /netshare/generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/netshare/generators/__init__.py -------------------------------------------------------------------------------- /netshare/generators/generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import copy 4 | import warnings 5 | import pandas as pd 6 | 7 | import netshare.pre_post_processors as pre_post_processors 8 | import netshare.model_managers as model_managers 9 | import netshare.models as models 10 | from ..pre_post_processors.netshare.util import create_sdmetrics_config 11 | 12 | from config_io import Config 13 | from sdmetrics.reports.timeseries import QualityReport 14 | from ..configs import default as default_configs 15 | 16 | 17 | class Generator(object): 18 | def __init__(self, config): 19 | self._config = Config.load_from_file( 20 | config, 21 | default_search_paths=default_configs.__path__) 22 | config = copy.deepcopy(self._config) 23 | 24 | global_config = self._config["global_config"] 25 | 26 | if 'original_data_folder' in global_config and \ 27 | 'file_extension' not in global_config: 28 | raise ValueError('Input is a folder. 
' 29 | 'Intended file extensions must be specified with ' 30 | '`file_extension=<.ext>` (e.g., `.pcap`, `.csv`') 31 | 32 | if 'original_data_folder' in global_config and \ 33 | "original_data_file" in global_config: 34 | raise ValueError( 35 | 'Input can be either a single file or \ 36 | a folder (with multiple valid files)!') 37 | self._ori_data_path = global_config['original_data_folder'] \ 38 | if 'original_data_folder' in global_config \ 39 | else global_config['original_data_file'] 40 | self._overwrite = global_config['overwrite'] 41 | 42 | pre_post_processor_class = getattr( 43 | pre_post_processors, config['pre_post_processor']['class']) 44 | pre_post_processor_config = Config(global_config) 45 | pre_post_processor_config.update( 46 | config['pre_post_processor']['config']) 47 | self._pre_post_processor = pre_post_processor_class( 48 | config=pre_post_processor_config) 49 | 50 | model_manager_class = getattr( 51 | model_managers, config['model_manager']['class']) 52 | model_manager_config = Config(global_config) 53 | model_manager_config.update(config['model_manager']['config']) 54 | self._model_manager = model_manager_class(config=model_manager_config) 55 | 56 | model_class = getattr(models, config['model']['class']) 57 | model_config = config['model']['config'] 58 | self._model = model_class 59 | self._model_config = model_config 60 | 61 | def _get_pre_processed_data_folder(self, work_folder): 62 | return os.path.join(work_folder, 'pre_processed_data') 63 | 64 | def _get_post_processed_data_folder(self, work_folder): 65 | return os.path.join(work_folder, 'post_processed_data') 66 | 67 | def _get_generated_data_folder(self, work_folder): 68 | return os.path.join(work_folder, 'generated_data') 69 | 70 | def _get_model_folder(self, work_folder): 71 | return os.path.join(work_folder, 'models') 72 | 73 | def _get_visualization_folder(self, work_folder): 74 | return os.path.join(work_folder, "visulization") 75 | 76 | def _get_pre_processed_data_log_folder(self, work_folder): 77 | return os.path.join(work_folder, 'logs', 'pre_processed_data') 78 | 79 | def _get_post_processed_data_log_folder(self, work_folder): 80 | return os.path.join(work_folder, 'logs', 'post_processed_data') 81 | 82 | def _get_generated_data_log_folder(self, work_folder): 83 | return os.path.join(work_folder, 'logs', 'generated_data') 84 | 85 | def _get_model_log_folder(self, work_folder): 86 | return os.path.join(work_folder, 'logs', 'models') 87 | 88 | def _pre_process(self, input_folder, output_folder, log_folder): 89 | if not self._check_folder(output_folder): 90 | return False 91 | if not self._check_folder(log_folder): 92 | return False 93 | return self._pre_post_processor.pre_process( 94 | input_folder=input_folder, 95 | output_folder=output_folder, 96 | log_folder=log_folder) 97 | 98 | def _post_process(self, input_folder, output_folder, 99 | pre_processed_data_folder, log_folder): 100 | if not self._check_folder(output_folder): 101 | return False 102 | if not self._check_folder(log_folder): 103 | return False 104 | return self._pre_post_processor.post_process( 105 | input_folder=input_folder, 106 | output_folder=output_folder, 107 | pre_processed_data_folder=pre_processed_data_folder, 108 | log_folder=log_folder) 109 | 110 | def _train(self, input_train_data_folder, output_model_folder, log_folder): 111 | if not self._check_folder(output_model_folder): 112 | return False 113 | if not self._check_folder(log_folder): 114 | return False 115 | return self._model_manager.train( 116 | 
input_train_data_folder=input_train_data_folder, 117 | output_model_folder=output_model_folder, 118 | log_folder=log_folder, 119 | create_new_model=self._model, 120 | model_config=self._model_config) 121 | 122 | def _generate(self, input_train_data_folder, 123 | input_model_folder, output_syn_data_folder, log_folder): 124 | if not self._check_folder(output_syn_data_folder): 125 | return False 126 | if not self._check_folder(log_folder): 127 | return False 128 | return self._model_manager.generate( 129 | input_train_data_folder=input_train_data_folder, 130 | input_model_folder=input_model_folder, 131 | output_syn_data_folder=output_syn_data_folder, 132 | log_folder=log_folder, 133 | create_new_model=self._model, 134 | model_config=self._model_config) 135 | 136 | def _check_folder(self, folder): 137 | if os.path.exists(folder): 138 | if self._overwrite: 139 | warnings.warn( 140 | f'{folder} already exists. ' 141 | 'You are overwriting the results.') 142 | return True 143 | else: 144 | print( 145 | f'{folder} already exists. To avoid overwriting the ' 146 | 'results, please change the work_folder') 147 | return False 148 | return False 149 | os.makedirs(folder) 150 | return True 151 | 152 | def generate(self, work_folder): 153 | work_folder = os.path.expanduser(work_folder) 154 | if not self._generate( 155 | input_train_data_folder=self._get_pre_processed_data_folder( 156 | work_folder), 157 | input_model_folder=self._get_model_folder(work_folder), 158 | output_syn_data_folder=self._get_generated_data_folder( 159 | work_folder), 160 | log_folder=self._get_generated_data_log_folder(work_folder)): 161 | print('Failed to generate synthetic data') 162 | return False 163 | if not self._post_process( 164 | input_folder=self._get_generated_data_folder(work_folder), 165 | output_folder=self._get_post_processed_data_folder( 166 | work_folder), 167 | log_folder=self._get_post_processed_data_log_folder( 168 | work_folder), 169 | pre_processed_data_folder=self._get_pre_processed_data_folder( 170 | work_folder)): 171 | print('Failed to post-process data') 172 | return False 173 | print(f'Generated data is at ' 174 | f'{self._get_post_processed_data_folder(work_folder)}') 175 | return True 176 | 177 | def train(self, work_folder): 178 | work_folder = os.path.expanduser(work_folder) 179 | if not self._pre_process( 180 | input_folder=self._ori_data_path, 181 | output_folder=self._get_pre_processed_data_folder(work_folder), 182 | log_folder=self._get_pre_processed_data_log_folder( 183 | work_folder)): 184 | print('Failed to pre-process data') 185 | return False 186 | if not self._train( 187 | input_train_data_folder=self._get_pre_processed_data_folder( 188 | work_folder), 189 | output_model_folder=self._get_model_folder(work_folder), 190 | log_folder=self._get_model_log_folder(work_folder)): 191 | print('Failed to train the model') 192 | return False 193 | return True 194 | 195 | def train_and_generate(self, work_folder): 196 | work_folder = os.path.expanduser(work_folder) 197 | if not self.train(work_folder): 198 | return False 199 | if not self.generate(work_folder): 200 | return False 201 | return True 202 | 203 | def visualize(self, work_folder): 204 | work_folder = os.path.expanduser(work_folder) 205 | os.makedirs(self._get_visualization_folder(work_folder), exist_ok=True) 206 | real_data = pd.read_csv( 207 | os.path.join( 208 | self._get_pre_processed_data_folder(work_folder), "raw.csv")) 209 | # Find synthetic data with the largest ID 210 | syn_data_list = [ 211 | f 212 | for f in os.listdir( 213 | 
self._get_post_processed_data_folder(work_folder)) 214 | if f.endswith('.csv')] 215 | id_pattern = re.compile(r'id-(\d+).csv') 216 | ids = [int(id_pattern.search(filename).group(1)) 217 | for filename in syn_data_list if id_pattern.search(filename)] 218 | # Find the largest ID 219 | largest_id = max(ids) 220 | # Find the filename corresponding to the largest ID 221 | filename_with_largest_id = [ 222 | filename for filename in syn_data_list 223 | if f'id-{largest_id}.csv' in filename][0] 224 | print( 225 | f'The filename with the largest ID is: {filename_with_largest_id}') 226 | synthetic_data = pd.read_csv(os.path.join( 227 | self._get_post_processed_data_folder(work_folder), 228 | filename_with_largest_id 229 | )) 230 | 231 | # Visualize the real data and synthetic data 232 | pre_post_processor_config = Config(self._config["global_config"]) 233 | pre_post_processor_config.update( 234 | self._config['pre_post_processor']['config']) 235 | sdmetrics_config = create_sdmetrics_config( 236 | pre_post_processor_config, 237 | comparison_type='both' 238 | ) 239 | my_report = QualityReport( 240 | config_dict=sdmetrics_config['config']) 241 | my_report.generate(real_data[synthetic_data.columns], synthetic_data, 242 | sdmetrics_config['metadata']) 243 | my_report.visualize() 244 | -------------------------------------------------------------------------------- /netshare/model_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_manager import ModelManager 2 | from .netshare_manager.netshare_manager import NetShareManager 3 | from .dg_model_manager import DGModelManager 4 | 5 | __all__ = ['ModelManager', 'NetShareManager', 'DGModelManager'] 6 | -------------------------------------------------------------------------------- /netshare/model_managers/dg_model_manager.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import netshare.ray as ray 3 | 4 | from .model_manager import ModelManager 5 | 6 | 7 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 8 | def _train_model(create_new_model, config, input_train_data_folder, 9 | output_model_folder, log_folder): 10 | model = create_new_model(config) 11 | model.train( 12 | input_train_data_folder=input_train_data_folder, 13 | output_model_folder=output_model_folder, 14 | log_folder=log_folder) 15 | return True 16 | 17 | 18 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 19 | def _generate_data(create_new_model, config, input_train_data_folder, 20 | input_model_folder, output_syn_data_folder, log_folder): 21 | config["given_data_attribute_flag"] = False 22 | config["save_without_chunk"] = True 23 | model = create_new_model(config) 24 | model.generate( 25 | input_train_data_folder=input_train_data_folder, 26 | input_model_folder=input_model_folder, 27 | output_syn_data_folder=output_syn_data_folder, 28 | log_folder=log_folder) 29 | return True 30 | 31 | 32 | class DGModelManager(ModelManager): 33 | 34 | def _train(self, input_train_data_folder, output_model_folder, log_folder, 35 | create_new_model, model_config): 36 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 37 | ray.get(_train_model.remote( 38 | create_new_model=create_new_model, 39 | config=model_config, 40 | input_train_data_folder=input_train_data_folder, 41 | output_model_folder=output_model_folder, 42 | log_folder=log_folder)) 43 | return True 44 | 45 | def _generate(self, input_train_data_folder, input_model_folder, 46 | output_syn_data_folder, 
log_folder, create_new_model, 47 | model_config): 48 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 49 | ray.get(_generate_data.remote( 50 | create_new_model=create_new_model, 51 | config=model_config, 52 | input_train_data_folder=input_train_data_folder, 53 | input_model_folder=input_model_folder, 54 | output_syn_data_folder=output_syn_data_folder, 55 | log_folder=log_folder)) 56 | return True 57 | -------------------------------------------------------------------------------- /netshare/model_managers/model_manager.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | 4 | from netshare.utils import Tee 5 | 6 | 7 | class ModelManager(ABC): 8 | def __init__(self, config): 9 | self._config = config 10 | 11 | @abstractmethod 12 | def _train(self, input_train_data_folder, output_model_folder, 13 | log_folder, create_new_model, model_config): 14 | ... 15 | 16 | @abstractmethod 17 | def _generate(self, 18 | input_train_data_folder, input_model_folder, 19 | output_syn_data_folder, log_folder, 20 | create_new_model, model_config): 21 | ... 22 | 23 | def train(self, input_train_data_folder, output_model_folder, log_folder, 24 | create_new_model, model_config): 25 | stdout_log_path = os.path.join(log_folder, 'train.stdout.log') 26 | stderr_log_path = os.path.join(log_folder, 'train.stderr.log') 27 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 28 | return self._train( 29 | input_train_data_folder=input_train_data_folder, 30 | output_model_folder=output_model_folder, 31 | log_folder=log_folder, 32 | create_new_model=create_new_model, 33 | model_config=model_config) 34 | 35 | def generate(self, 36 | input_train_data_folder, input_model_folder, 37 | output_syn_data_folder, log_folder, 38 | create_new_model, model_config): 39 | stdout_log_path = os.path.join(log_folder, 'generate.stdout.log') 40 | stderr_log_path = os.path.join(log_folder, 'generate.stderr.log') 41 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 42 | return self._generate( 43 | input_train_data_folder=input_train_data_folder, 44 | input_model_folder=input_model_folder, 45 | output_syn_data_folder=output_syn_data_folder, 46 | log_folder=log_folder, 47 | create_new_model=create_new_model, 48 | model_config=model_config) 49 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/generate_helper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import subprocess 3 | import sys 4 | import time 5 | import os 6 | import json 7 | import importlib 8 | import random 9 | import pickle 10 | import pandas as pd 11 | import socket 12 | import struct 13 | import ipaddress 14 | import argparse 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | import netshare.ray as ray 20 | from pathlib import Path 21 | from tqdm import tqdm 22 | from scapy.all import IP, ICMP, TCP, UDP 23 | from scapy.all import wrpcap 24 | from scipy.stats import rankdata 25 | from pathlib import Path 26 | 27 | 28 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 29 | def _generate_session( 30 | create_new_model, 31 | configs, 32 | config_idx, 33 | log_folder): 34 | config = configs[config_idx] 35 | config["given_data_attribute_flag"] = False 36 | model = create_new_model(config) 37 | model.generate( 38 | input_train_data_folder=config["dataset"], 39 | 
input_model_folder=config["result_folder"], 40 | output_syn_data_folder=config["eval_root_folder"], 41 | log_folder=log_folder) 42 | 43 | 44 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 45 | def _generate_attr( 46 | create_new_model, 47 | configs, 48 | config_idx, 49 | log_folder): 50 | config = configs[config_idx] 51 | config["given_data_attribute_flag"] = False 52 | model = create_new_model(config) 53 | model.generate( 54 | input_train_data_folder=config["dataset"], 55 | input_model_folder=config["result_folder"], 56 | output_syn_data_folder=config["eval_root_folder"], 57 | log_folder=log_folder) 58 | 59 | 60 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 61 | def _merge_attr( 62 | attr_raw_npz_folder, 63 | config_group, 64 | configs 65 | ): 66 | num_chunks = len(config_group["config_ids"]) 67 | chunk0_idx = config_group["config_ids"][0] 68 | chunk0_config = configs[chunk0_idx] 69 | print("chunk0 config:", configs[chunk0_idx]) 70 | 71 | # Find flow tag starting point 72 | with open(os.path.join(chunk0_config["dataset"], "data_attribute_fields.pkl"), 'rb') as f: 73 | data_attribute_fields = pickle.load(f) 74 | bit_idx_flagstart = 0 75 | for field_idx, field in enumerate(data_attribute_fields): 76 | if field.name != "startFromThisChunk": 77 | bit_idx_flagstart += field.dim_x 78 | else: 79 | break 80 | print("bit_idx_flagstart:", bit_idx_flagstart) 81 | 82 | attr_clean_npz_folder = os.path.join( 83 | str(Path(attr_raw_npz_folder).parents[0]), "attr_clean" 84 | ) 85 | os.makedirs(attr_clean_npz_folder, exist_ok=True) 86 | 87 | dict_chunkid_attr = {} 88 | dict_chunkid_attr_discrete = {} 89 | for chunkid in tqdm(range(num_chunks)): 90 | dict_chunkid_attr[chunkid] = [] 91 | dict_chunkid_attr_discrete[chunkid] = [] 92 | 93 | for chunkid in tqdm(range(num_chunks)): 94 | n_flows_startFromThisEpoch = 0 95 | 96 | if not os.path.exists( 97 | os.path.join( 98 | attr_raw_npz_folder, 99 | "chunk_id-{}.npz".format(chunkid)) 100 | ): 101 | print( 102 | "{} not exists...".format( 103 | os.path.join( 104 | attr_raw_npz_folder, 105 | "chunk_id-{}.npz".format(chunkid)) 106 | ) 107 | ) 108 | continue 109 | 110 | raw_attr_chunk = np.load( 111 | os.path.join( 112 | attr_raw_npz_folder, 113 | "chunk_id-{}.npz".format(chunkid)) 114 | )["data_attribute"] 115 | raw_attr_discrete_chunk = np.load( 116 | os.path.join( 117 | attr_raw_npz_folder, 118 | "chunk_id-{}.npz".format(chunkid)) 119 | )["data_attribute_discrete"] 120 | 121 | if num_chunks > 1: 122 | for row_idx, row in enumerate(raw_attr_chunk): 123 | # if row[bit_idx_flagstart] < row[bit_idx_flagstart+1]: 124 | if ( 125 | row[bit_idx_flagstart] < row[bit_idx_flagstart + 1] 126 | and row[bit_idx_flagstart + 2 * chunkid + 2] 127 | < row[bit_idx_flagstart + 2 * chunkid + 3] 128 | ): 129 | # this chunk 130 | row_this_chunk = list( 131 | copy.deepcopy(row)[ 132 | :bit_idx_flagstart]) 133 | row_this_chunk += [0.0, 1.0] 134 | row_this_chunk += [1.0, 0.0] * (chunkid + 1) 135 | for i in range(chunkid + 1, num_chunks): 136 | if ( 137 | row[bit_idx_flagstart + 2 * i + 2] 138 | < row[bit_idx_flagstart + 2 * i + 3] 139 | ): 140 | row_this_chunk += [0.0, 1.0] 141 | else: 142 | row_this_chunk += [1.0, 0.0] 143 | # dict_chunkid_attr[chunkid].append(row_this_chunk) 144 | dict_chunkid_attr[chunkid].append(row) 145 | dict_chunkid_attr_discrete[chunkid].append( 146 | raw_attr_discrete_chunk[row_idx]) 147 | 148 | # following chunks 149 | # row_following_chunk = list(copy.deepcopy(row)[:bit_idx_flagstart]) 150 | # row_following_chunk += [1.0, 0.0]*(1+NUM_CHUNKS) 
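                    # Descriptive note: the block below copies this attribute
                    # row, flips its "startFromThisChunk" one-hot pair to
                    # "not starting here" ([1.0, 0.0]), and appends the copy to
                    # every later chunk whose per-chunk flag pair indicates the
                    # flow is still active, so those chunks can generate the
                    # continuation of the same flow.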
151 | n_flows_startFromThisEpoch += 1 152 | row_following_chunk = list(copy.deepcopy(row)) 153 | row_following_chunk[bit_idx_flagstart] = 1.0 154 | row_following_chunk[bit_idx_flagstart + 1] = 0.0 155 | 156 | row_discrete_following_chunk = list( 157 | copy.deepcopy(raw_attr_discrete_chunk[row_idx])) 158 | row_discrete_following_chunk[bit_idx_flagstart] = 1.0 159 | row_discrete_following_chunk[bit_idx_flagstart + 1] = 0.0 160 | 161 | for i in range(chunkid + 1, num_chunks): 162 | if ( 163 | row[bit_idx_flagstart + 2 * i + 2] 164 | < row[bit_idx_flagstart + 2 * i + 3] 165 | ): 166 | dict_chunkid_attr[i].append(row_following_chunk) 167 | dict_chunkid_attr_discrete[i].append( 168 | row_discrete_following_chunk) 169 | # dict_chunkid_attr[i].append(row) 170 | else: 171 | dict_chunkid_attr[chunkid] = raw_attr_chunk 172 | dict_chunkid_attr_discrete[chunkid] = raw_attr_discrete_chunk 173 | 174 | print( 175 | "n_flows_startFromThisEpoch / total flows: {}/{}".format( 176 | n_flows_startFromThisEpoch, raw_attr_chunk.shape[0] 177 | ) 178 | ) 179 | 180 | print("Saving merged attrs...") 181 | n_merged_attrs = 0 182 | for chunkid, attr_clean in dict_chunkid_attr.items(): 183 | print("chunk {}: {} flows".format(chunkid, len(attr_clean))) 184 | n_merged_attrs += len(attr_clean) 185 | np.savez( 186 | os.path.join( 187 | attr_clean_npz_folder, "chunk_id-{}.npz".format(chunkid)), 188 | data_attribute=np.asarray(attr_clean), 189 | data_attribute_discrete=np.asarray( 190 | dict_chunkid_attr_discrete[chunkid])) 191 | 192 | print("n_merged_attrs:", n_merged_attrs) 193 | 194 | 195 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 196 | # @ray.remote(scheduling_strategy="DEFAULT", max_calls=1) 197 | def _generate_given_attr(create_new_model, configs, config_idx, 198 | log_folder): 199 | 200 | config = configs[config_idx] 201 | config["given_data_attribute_flag"] = True 202 | model = create_new_model(config) 203 | model.generate( 204 | input_train_data_folder=config["dataset"], 205 | input_model_folder=config["result_folder"], 206 | output_syn_data_folder=config["eval_root_folder"], 207 | log_folder=log_folder) 208 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/netshare_manager.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from ..model_manager import ModelManager 4 | from .train_helper import _train_specific_config_group 5 | from .generate_helper import _generate_attr, _merge_attr, _generate_given_attr, _generate_session 6 | from .netshare_util import _load_config, _configs2configsgroup 7 | import netshare.ray as ray 8 | import os 9 | import time 10 | import json 11 | 12 | import pandas as pd 13 | 14 | 15 | class NetShareManager(ModelManager): 16 | def _train(self, input_train_data_folder, output_model_folder, log_folder, 17 | create_new_model, model_config): 18 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 19 | 20 | configs = _load_config( 21 | config_dict={ 22 | **self._config, 23 | **model_config}, 24 | input_train_data_folder=input_train_data_folder, 25 | output_model_folder=output_model_folder) 26 | 27 | configs, config_group_list = _configs2configsgroup( 28 | configs=configs, 29 | generation_flag=False) 30 | print(config_group_list) 31 | with open(os.path.join(output_model_folder, "configs_train.json"), 'w') as f: 32 | json.dump({ 33 | "configs": configs, 34 | "config_group_list": config_group_list 35 | }, f, indent=4) 36 | 37 | objs = [] 38 | for 
config_group_id, config_group in enumerate(config_group_list): 39 | objs.append( 40 | _train_specific_config_group.remote( 41 | create_new_model=create_new_model, 42 | config_group_id=config_group_id, 43 | config_group=config_group, 44 | configs=configs, 45 | input_train_data_folder=input_train_data_folder, 46 | output_model_folder=output_model_folder, 47 | log_folder=log_folder) 48 | ) 49 | results = ray.get(objs) 50 | return results 51 | 52 | def _generate( 53 | self, input_train_data_folder, input_model_folder, 54 | output_syn_data_folder, log_folder, create_new_model, model_config): 55 | configs = _load_config( 56 | config_dict={ 57 | **self._config, 58 | **model_config}, 59 | input_train_data_folder=input_train_data_folder, 60 | output_model_folder=input_model_folder) 61 | 62 | configs, config_group_list = _configs2configsgroup( 63 | configs=configs, 64 | generation_flag=True, 65 | output_syn_data_folder=output_syn_data_folder 66 | ) 67 | 68 | with open(os.path.join(output_syn_data_folder, "configs_generate.json"), 'w') as f: 69 | json.dump({ 70 | "configs": configs, 71 | "config_group_list": config_group_list 72 | }, f, indent=4) 73 | 74 | print("Start generating attributes ...") 75 | if configs[0]["n_chunks"] > 1: 76 | objs = [] 77 | for config_idx, config in enumerate(configs): 78 | objs.append( 79 | _generate_attr.remote( 80 | create_new_model=create_new_model, 81 | configs=configs, 82 | config_idx=config_idx, 83 | log_folder=log_folder)) 84 | _ = ray.get(objs) 85 | time.sleep(10) 86 | print("Finish generating attributes") 87 | 88 | print("Start merging attributes ...") 89 | objs = [] 90 | for config_group in config_group_list: 91 | chunk0_idx = config_group["config_ids"][0] 92 | eval_root_folder = configs[chunk0_idx]["eval_root_folder"] 93 | 94 | objs.append( 95 | _merge_attr.remote( 96 | attr_raw_npz_folder=os.path.join( 97 | eval_root_folder, "attr_raw"), 98 | config_group=config_group, 99 | configs=configs) 100 | ) 101 | _ = ray.get(objs) 102 | time.sleep(10) 103 | print("Finish merging attributes...") 104 | 105 | print("Start generating features given attributes ...") 106 | objs = [] 107 | for config_idx, config in enumerate(configs): 108 | objs.append( 109 | _generate_given_attr.remote( 110 | create_new_model=create_new_model, 111 | configs=configs, 112 | config_idx=config_idx, 113 | log_folder=log_folder)) 114 | _ = ray.get(objs) 115 | time.sleep(10) 116 | else: 117 | objs = [] 118 | for config_idx, config in enumerate(configs): 119 | objs.append( 120 | _generate_session.remote( 121 | create_new_model=create_new_model, 122 | configs=configs, 123 | config_idx=config_idx, 124 | log_folder=log_folder)) 125 | _ = ray.get(objs) 126 | print("Finish generating features given attributes ...") 127 | 128 | return True 129 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/netshare_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from config_io import Config 5 | 6 | 7 | def _load_config(config_dict, input_train_data_folder, output_model_folder): 8 | config_pre_expand = Config(config_dict) 9 | 10 | # TODO: add preprocessing logic for DoppelGANger (single-chunk?) 
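    # Descriptive note: the block below scans input_train_data_folder for
    # per-chunk subfolders named `chunkid-<i>`, registers each existing one in
    # config["dataset"], resets n_chunks to the number of chunks actually
    # found, and then calls expand() so that each (chunk, expanded option)
    # combination becomes its own configuration.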
11 | config_pre_expand["dataset"] = [] 12 | config_pre_expand["dataset_expand"] = True 13 | n_valid_chunks = 0 14 | for chunk_id in range(config_pre_expand["n_chunks"]): 15 | dataset_folder = os.path.join( 16 | input_train_data_folder, f"chunkid-{chunk_id}") 17 | if os.path.exists(dataset_folder) and os.path.isdir(dataset_folder): 18 | config_pre_expand["dataset"].append(dataset_folder) 19 | n_valid_chunks += 1 20 | config_pre_expand["n_chunks"] = n_valid_chunks 21 | print("Number of valid chunks:", config_pre_expand["n_chunks"]) 22 | 23 | config_post_expand = config_pre_expand.expand() 24 | print( 25 | f"Number of configurations after expanded: {len(config_post_expand)}") 26 | 27 | configs = [] 28 | for config_ in config_post_expand: 29 | sub_result_folder = os.path.join( 30 | os.path.basename(config_["dataset"]), 31 | ",".join("{}-{}".format(k, os.path.basename(str(v))) 32 | for k, v in config_.items() 33 | if f"{k}_expand" in config_.keys() and k != "dataset") 34 | ) 35 | config_["sub_result_folder"] = sub_result_folder 36 | config_["result_folder"] = os.path.join( 37 | output_model_folder, sub_result_folder) 38 | 39 | # sanity check 40 | if config_["pretrain_non_dp"] and \ 41 | ((config_["dp_noise_multiplier"] is not None) or 42 | (config_["dp_l2_norm_clip"] is not None)): 43 | raise ValueError( 44 | "pretrain_non_DP can only be used for non-DP case!") 45 | 46 | if config_["pretrain_non_dp"] and \ 47 | config_["pretrain_non_dp_reduce_time"] is None: 48 | raise ValueError( 49 | "pretrain_non_dp=True, " 50 | "then pretrain_non_dp_reduce_time must be set!") 51 | 52 | if not config_["pretrain_non_dp"] and \ 53 | config_["pretrain_non_dp_reduce_time"] is not None: 54 | raise ValueError( 55 | "pretrain_non_dp=False, " 56 | "pretrain_non_dp_reduce_time does not need to be set!") 57 | 58 | if config_["pretrain_non_dp"] and config_["pretrain_dp"]: 59 | raise ValueError( 60 | "Only one of pretrain_non_DP and pretrain_DP can be True!") 61 | 62 | if config_["pretrain_dp"] and config_["pretrain_dir"] is None: 63 | raise ValueError( 64 | "You are using DP with pretrained public model, " 65 | "pretrain_dir must be set to the pretrained public model " 66 | "checkpoint directory!") 67 | 68 | configs.append(config_) 69 | 70 | return configs 71 | 72 | 73 | def get_configid_from_kv(configs, k, v): 74 | for idx, config in enumerate(configs): 75 | if config[k] == v: 76 | return idx 77 | raise ValueError("{}: {} not found in configs!".format(k, v)) 78 | 79 | 80 | def _configs2configsgroup( 81 | configs, 82 | generation_flag=False, 83 | output_syn_data_folder=None): 84 | ''' 85 | # convert a list of configurations to a grouped dictionary 86 | # for training purpose 87 | # key : value 88 | # dp : bool 89 | # dp_noise_multiplier: float 90 | # pretrain: bool 91 | # config_ids: list 92 | ''' 93 | if generation_flag and output_syn_data_folder is None: 94 | raise ValueError("Generation phase: " 95 | "output_syn_data_folder must be specified") 96 | 97 | config_id_list_victim = [i for i in range(len(configs))] 98 | config_group_list = [] 99 | 100 | for config_id, config in enumerate(configs): 101 | if config_id in config_id_list_victim: 102 | config_group = {} 103 | config_group["dp_noise_multiplier"] = config["dp_noise_multiplier"] 104 | config_group["dp"] = (config["dp_noise_multiplier"] is not None) 105 | config_group["pretrain"] = ( 106 | config["pretrain_non_dp"] or config["pretrain_dp"]) 107 | config_group["config_ids"] = [] 108 | 109 | num_chunks = config["n_chunks"] 110 | for chunk_idx in range(num_chunks): 
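                # Locate the config belonging to this chunk by substituting the
                # chunk id inside result_folder, then remove it from the victim
                # list so it is not assigned to another group.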
111 | config_id_ = get_configid_from_kv( 112 | configs=configs, 113 | k="result_folder", 114 | v=re.sub( 115 | 'chunkid-[0-9]+', 116 | 'chunkid-{}'.format(chunk_idx), 117 | config["result_folder"])) 118 | config_group["config_ids"].append(config_id_) 119 | config_id_list_victim.remove(config_id_) 120 | 121 | config_group_list.append(config_group) 122 | 123 | # sanity check 124 | assert len(config_id_list_victim) == 0 125 | config_ids_check = [] 126 | for config_group in config_group_list: 127 | config_ids_check += config_group["config_ids"] 128 | assert set(config_ids_check) == set([i for i in range(len(configs))]) 129 | 130 | # add pretrain_dir etc. to the original configs 131 | for config_group in config_group_list: 132 | if not config_group["pretrain"]: 133 | for config_id in config_group["config_ids"]: 134 | configs[config_id]["restore"] = False 135 | else: 136 | if not config_group["dp"]: 137 | for chunk_id, config_id in enumerate( 138 | config_group["config_ids"]): 139 | if chunk_id == 0: 140 | chunk0_idx = config_id 141 | configs[config_id]["restore"] = False 142 | epoch_range = list( 143 | range( 144 | configs[config_id]["epoch_checkpoint_freq"]-1, 145 | configs[config_id]["epochs"], 146 | configs[config_id]["epoch_checkpoint_freq"])) 147 | epoch_range.reverse() 148 | 149 | pretrain_dir = None 150 | # use last available ckpt 151 | if configs[config_id]["skip_chunk0_train"]: 152 | last_epoch_found = False 153 | for epoch_id in epoch_range: 154 | if last_epoch_found: 155 | break 156 | ckpt_dir = os.path.join( 157 | configs[config_id]["result_folder"], 158 | "checkpoint", 159 | "epoch_id-{}.pt".format(epoch_id) 160 | ) 161 | if os.path.exists(ckpt_dir): 162 | last_epoch_found = True 163 | 164 | if not last_epoch_found: 165 | raise ValueError( 166 | "Skipping chunk0 training but " 167 | "chunk0 has no available ckpt at {}! " 168 | "Please move ckpts into the " 169 | "corresponding folder.".format( 170 | configs[config_id]["result_folder"])) 171 | else: 172 | pretrain_dir = ckpt_dir 173 | else: 174 | if os.path.exists(os.path.join( 175 | configs[config_id]["result_folder"], 176 | "checkpoint") 177 | ) and not generation_flag: 178 | raise ValueError( 179 | "Chunk0 training NOT skipped " 180 | "but ckpts already exist! 
" 181 | "Please change your working folder " 182 | "or clean up the ckpt folder " 183 | "to continute training from scratch.") 184 | 185 | pretrain_dir = os.path.join( 186 | configs[config_id]["result_folder"], 187 | "checkpoint", 188 | "epoch_id-{}.pt".format( 189 | epoch_range[0]) 190 | ) 191 | 192 | configs[config_id]["pretrain_dir"] = pretrain_dir 193 | 194 | else: 195 | configs[config_id]["restore"] = True 196 | configs[config_id]["pretrain_dir"] = pretrain_dir 197 | configs[config_id]["epochs"] = int( 198 | configs[config_id]["epochs"] / 199 | configs[config_id]["pretrain_non_dp_reduce_time"]) 200 | 201 | else: 202 | for chunk_id, config_id in enumerate( 203 | config_group["config_ids"]): 204 | configs[config_id]["restore"] = True 205 | 206 | # add chunk_id and eval_root_folder for generation related 207 | if generation_flag: 208 | for config_group in config_group_list: 209 | chunk0_idx = config_group["config_ids"][0] 210 | eval_root_folder = os.path.join( 211 | output_syn_data_folder, 212 | re.sub( 213 | 'chunkid-0', 214 | '', 215 | configs[chunk0_idx]["sub_result_folder"]).strip("/")) 216 | for chunk_id, config_id in enumerate(config_group["config_ids"]): 217 | configs[config_id]["chunk_id"] = chunk_id 218 | configs[config_id]["eval_root_folder"] = eval_root_folder 219 | 220 | for config in configs: 221 | os.makedirs(config["result_folder"], exist_ok=True) 222 | if generation_flag: 223 | os.makedirs(config["eval_root_folder"], exist_ok=True) 224 | 225 | return configs, config_group_list 226 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/train_helper.py: -------------------------------------------------------------------------------- 1 | import netshare.ray as ray 2 | import os 3 | 4 | 5 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 6 | def _launch_one_chunk_training( 7 | create_new_model, configs, config_idx, input_train_data_folder, 8 | output_model_folder, log_folder): 9 | model = create_new_model(configs[config_idx]) 10 | obj = model.train(input_train_data_folder, output_model_folder, log_folder) 11 | return obj 12 | 13 | 14 | def _launch_other_chunks_training( 15 | create_new_model, configs, config_ids, input_train_data_folder, 16 | output_model_folder, log_folder): 17 | chunk0_idx = config_ids[0] 18 | if configs[chunk0_idx]["skip_chunk0_train"] and configs[chunk0_idx][ 19 | "pretrain_dir"] is None: 20 | raise ValueError( 21 | "Skipping chunk0 training but chunk0 has no available ckpt!" 
22 | "Please move ckpts into the corresponding folder.") 23 | objs = [] 24 | for config_idx in config_ids[1:]: 25 | # sanity check 26 | if not os.path.exists(configs[config_idx]["pretrain_dir"]): 27 | raise ValueError( 28 | f"Pretrain_dir {configs[config_idx]['pretrain_dir']} does not exist!") 29 | objs.append( 30 | _launch_one_chunk_training.remote( 31 | create_new_model, 32 | configs, 33 | config_idx, 34 | input_train_data_folder, 35 | output_model_folder, 36 | log_folder)) 37 | 38 | results = ray.get(objs) 39 | return results 40 | 41 | 42 | def _launch_all_chunks_training( 43 | create_new_model, configs, config_ids, input_train_data_folder, 44 | output_model_folder, log_folder): 45 | objs = [] 46 | for config_idx in config_ids: 47 | # sanity check 48 | if not os.path.exists(configs[config_idx]["pretrain_dir"]): 49 | raise ValueError("Pretrain_dir {} does not exist!") 50 | objs.append( 51 | _launch_one_chunk_training.remote( 52 | create_new_model, 53 | configs, 54 | config_idx, 55 | input_train_data_folder, 56 | output_model_folder, 57 | log_folder)) 58 | 59 | results = ray.get(objs) 60 | return results 61 | 62 | 63 | @ray.remote(scheduling_strategy="SPREAD") 64 | def _train_specific_config_group( 65 | create_new_model, 66 | config_group_id, 67 | config_group, 68 | configs, 69 | input_train_data_folder, 70 | output_model_folder, 71 | log_folder): 72 | print( 73 | "Config group {}: DP: {}, pretrain: {}".format( 74 | config_group_id, config_group["dp"], config_group["pretrain"] 75 | ) 76 | ) 77 | config_ids = config_group["config_ids"] 78 | if config_group["dp"] == False and config_group["pretrain"] == True: 79 | chunk0_idx = config_ids[0] 80 | if configs[chunk0_idx]["skip_chunk0_train"] == True: 81 | print("Skipping chunk0 training...") 82 | else: 83 | print("Start launching chunk0 experiments...") 84 | # launch first chunk 85 | config_idx = config_ids[0] 86 | result = ray.get( 87 | _launch_one_chunk_training.remote( 88 | create_new_model, 89 | configs, 90 | config_idx, 91 | input_train_data_folder, 92 | output_model_folder, 93 | log_folder)) 94 | 95 | print("Finish launching chunk0 experiments ...") 96 | 97 | if len(configs) > 1: 98 | print( 99 | f"Start waiting for other chunks from config_group_id {config_group_id} experiments finished ...") 100 | results = _launch_other_chunks_training( 101 | create_new_model, 102 | configs, 103 | config_ids, 104 | input_train_data_folder, 105 | output_model_folder, 106 | log_folder) 107 | print(f"Other chunks from config_group_id {config_group_id} training finished") 108 | 109 | else: 110 | print("Launching all chunks experiments...") 111 | # Haven't been tested 112 | results = _launch_all_chunks_training( 113 | create_new_model, 114 | configs, 115 | config_ids, 116 | input_train_data_folder, 117 | output_model_folder, 118 | log_folder) 119 | 120 | return True 121 | -------------------------------------------------------------------------------- /netshare/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Model 2 | from .doppelganger_torch_model import DoppelGANgerTorchModel 3 | 4 | __all__ = ['Model', 'DoppelGANgerTorchModel'] 5 | -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/netshare/models/doppelganger_torch/__init__.py 
-------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/load_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import pickle 5 | 6 | 7 | def load_data(path, sample_len, flag="train"): 8 | 9 | data_npz = np.load(os.path.join(path, "data_{}.npz".format(flag))) 10 | with open(os.path.join(path, "data_feature_output.pkl"), "rb") as f: 11 | data_feature_outputs = pickle.load(f) 12 | with open(os.path.join(path, "data_attribute_output.pkl"), "rb") as f: 13 | data_attribute_outputs = pickle.load(f) 14 | 15 | data_feature = data_npz["data_feature"] 16 | data_attribute = data_npz["data_attribute"] 17 | data_gen_flag = data_npz["data_gen_flag"] 18 | 19 | # Append data_feature and data_gen_flag to multiple of sample_len 20 | timeseries_len = data_feature.shape[1] 21 | ceil_timeseries_len = math.ceil(timeseries_len / sample_len) * sample_len 22 | data_feature = np.pad( 23 | data_feature, 24 | pad_width=((0, 0), 25 | (0, ceil_timeseries_len - timeseries_len), 26 | (0, 0)), 27 | mode='constant', constant_values=0) 28 | data_gen_flag = np.pad( 29 | data_gen_flag, 30 | pad_width=((0, 0), 31 | (0, ceil_timeseries_len - timeseries_len)), 32 | mode='constant', constant_values=0) 33 | 34 | return ( 35 | data_feature, 36 | data_attribute, 37 | data_gen_flag, 38 | data_feature_outputs, 39 | data_attribute_outputs, 40 | ) 41 | -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/privacy_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Command-line script for computing privacy of a model trained with DP-SGD. 18 | The script applies the RDP accountant to estimate privacy budget of an iterated 19 | Sampled Gaussian Mechanism. 20 | The code is mainly based on Google's TF Privacy: 21 | https://github.com/tensorflow/privacy/blob/master/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy.py 22 | Example: 23 | To call this script from command line, you can enter: 24 | $ python -m opacus.scripts.compute_dp_sgd_privacy --epochs=3 --delta=1e-5 --sample-rate 0.01 --noise-multiplier 1.0 --alphas 2 5 10 20 100 25 | DP-SGD with 26 | - sampling rate = 1%, 27 | - noise_multiplier = 1.0, 28 | - iterated over 300 steps 29 | satisfies differential privacy with 30 | - epsilon = 2.39, 31 | - delta = 1e-05. 32 | The optimal alpha is 5.0. 
33 | """ 34 | import argparse 35 | import math 36 | from typing import List, Tuple 37 | 38 | from opacus.accountants.analysis.rdp import compute_rdp, get_privacy_spent 39 | 40 | 41 | def _apply_dp_sgd_analysis( 42 | *, 43 | sample_rate: float, 44 | noise_multiplier: float, 45 | steps: int, 46 | alphas: List[float], 47 | delta: float, 48 | verbose: bool = True, 49 | ) -> Tuple[float, float]: 50 | """ 51 | Computes the privacy Epsilon at a given delta via RDP accounting and 52 | converting to an (epsilon, delta) guarantee for a target Delta. 53 | Args: 54 | sample_rate : The sample rate in SGD 55 | noise_multiplier : The ratio of the standard deviation of the Gaussian 56 | noise to the L2-sensitivity of the function to which the noise is added 57 | steps : The number of steps 58 | alphas : A list of RDP orders 59 | delta : Target delta 60 | verbose : If enabled, will print the results of DP-SGD analysis 61 | Returns: 62 | Pair of privacy loss epsilon and optimal order alpha 63 | """ 64 | rdp = compute_rdp( 65 | q=sample_rate, noise_multiplier=noise_multiplier, steps=steps, orders=alphas 66 | ) 67 | eps, opt_alpha = get_privacy_spent(orders=alphas, rdp=rdp, delta=delta) 68 | 69 | if verbose: 70 | print( 71 | f"DP-SGD with\n\tsampling rate = {100 * sample_rate:.3g}%," 72 | f"\n\tnoise_multiplier = {noise_multiplier}," 73 | f"\n\titerated over {steps} steps,\nsatisfies " 74 | f"differential privacy with\n\tepsilon = {eps:.3g}," 75 | f"\n\tdelta = {delta}." 76 | f"\nThe optimal alpha is {opt_alpha}." 77 | ) 78 | 79 | if opt_alpha == max(alphas) or opt_alpha == min(alphas): 80 | print( 81 | "The privacy estimate is likely to be improved by expanding " 82 | "the set of alpha orders." 83 | ) 84 | return eps, opt_alpha 85 | 86 | 87 | def compute_dp_sgd_privacy( 88 | *, 89 | sample_rate: float, 90 | noise_multiplier: float, 91 | epochs: int, 92 | delta: float, 93 | alphas: List[float], 94 | verbose: bool = True, 95 | ) -> Tuple[float, float]: 96 | """ 97 | Performs the DP-SGD privacy analysis. 98 | Finds sample rate and number of steps based on the input parameters, and calls 99 | DP-SGD privacy analysis to find the privacy loss epsilon and optimal order alpha. 
100 | Args: 101 | sample_rate : probability of each sample from the dataset to be selected for a next batch 102 | noise_multiplier : The ratio of the standard deviation of the Gaussian noise 103 | to the L2-sensitivity of the function to which the noise is added 104 | epochs : Number of epochs 105 | delta : Target delta 106 | alphas : A list of RDP orders 107 | verbose : If enabled, will print the results of DP-SGD analysis 108 | Returns: 109 | Pair of privacy loss epsilon and optimal order alpha 110 | Raises: 111 | ValueError 112 | When batch size is greater than sample size 113 | """ 114 | if sample_rate > 1: 115 | raise ValueError("sample_rate must be no greater than 1") 116 | steps = epochs * math.ceil(1 / sample_rate) 117 | 118 | return _apply_dp_sgd_analysis( 119 | sample_rate=sample_rate, 120 | noise_multiplier=noise_multiplier, 121 | steps=steps, 122 | alphas=alphas, 123 | delta=delta, 124 | verbose=verbose, 125 | ) 126 | 127 | 128 | def main(): 129 | parser = argparse.ArgumentParser( 130 | description="Estimate privacy of a model trained with DP-SGD using RDP accountant", 131 | ) 132 | parser.add_argument( 133 | "-r", 134 | "--sample-rate", 135 | type=float, 136 | required=True, 137 | help="Input sample rate (probability of each sample from the dataset to be selected for a next batch)", 138 | ) 139 | parser.add_argument( 140 | "-n", 141 | "--noise-multiplier", 142 | type=float, 143 | required=True, 144 | help="Noise multiplier", 145 | ) 146 | parser.add_argument( 147 | "-e", 148 | "--epochs", 149 | type=int, 150 | required=True, 151 | help="Number of epochs to train", 152 | ) 153 | parser.add_argument( 154 | "-d", "--delta", type=float, default=1e-5, help="Targeted delta (default: 1e-5)" 155 | ) 156 | parser.add_argument( 157 | "-a", 158 | "--alphas", 159 | action="store", 160 | dest="alphas", 161 | type=float, 162 | nargs="+", 163 | default=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)), 164 | help="List of alpha values (alpha orders of Renyi-DP evaluation). " 165 | "A default list is provided. Else, space separated numbers. 
E.g.," 166 | "-a 10 100", 167 | ) 168 | 169 | args = parser.parse_args() 170 | 171 | compute_dp_sgd_privacy( 172 | sample_rate=args.sample_rate, 173 | noise_multiplier=args.noise_multiplier, 174 | epochs=args.epochs, 175 | delta=args.delta, 176 | alphas=args.alphas, 177 | ) 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from netshare.utils import OutputType, Output, Normalization 3 | import numpy as np 4 | import matplotlib 5 | 6 | matplotlib.use("Agg") 7 | 8 | 9 | def renormalize_per_sample( 10 | data_feature, 11 | data_attribute, 12 | data_feature_outputs, 13 | data_attribute_outputs, 14 | gen_flags, 15 | num_real_attribute, 16 | ): 17 | attr_dim = 0 18 | for i in range(num_real_attribute): 19 | attr_dim += data_attribute_outputs[i].dim 20 | attr_dim_cp = attr_dim 21 | 22 | fea_dim = 0 23 | for output in data_feature_outputs: 24 | if output.type_ == OutputType.CONTINUOUS: 25 | for _ in range(output.dim): 26 | max_plus_min_d_2 = data_attribute[:, attr_dim] 27 | max_minus_min_d_2 = data_attribute[:, attr_dim + 1] 28 | attr_dim += 2 29 | 30 | max_ = max_plus_min_d_2 + max_minus_min_d_2 31 | min_ = max_plus_min_d_2 - max_minus_min_d_2 32 | 33 | max_ = np.expand_dims(max_, axis=1) 34 | min_ = np.expand_dims(min_, axis=1) 35 | 36 | if output.normalization == Normalization.MINUSONE_ONE: 37 | data_feature[:, :, fea_dim] = ( 38 | data_feature[:, :, fea_dim] + 1.0 39 | ) / 2.0 40 | 41 | data_feature[:, :, fea_dim] = ( 42 | data_feature[:, :, fea_dim] * (max_ - min_) + min_ 43 | ) 44 | 45 | fea_dim += 1 46 | else: 47 | fea_dim += output.dim 48 | 49 | tmp_gen_flags = np.expand_dims(gen_flags, axis=2) 50 | data_feature = data_feature * tmp_gen_flags 51 | 52 | data_attribute = data_attribute[:, 0:attr_dim_cp] 53 | 54 | return data_feature, data_attribute 55 | 56 | 57 | def normalize_per_sample( 58 | data_feature, data_attribute, data_feature_outputs, 59 | data_attribute_outputs, eps=1e-4): 60 | # assume all samples have maximum length 61 | data_feature_min = np.amin(data_feature, axis=1) 62 | data_feature_max = np.amax(data_feature, axis=1) 63 | 64 | additional_attribute = [] 65 | additional_attribute_outputs = [] 66 | 67 | dim = 0 68 | for output in data_feature_outputs: 69 | if output.type_ == OutputType.CONTINUOUS: 70 | for _ in range(output.dim): 71 | max_ = data_feature_max[:, dim] + eps 72 | min_ = data_feature_min[:, dim] - eps 73 | 74 | additional_attribute.append((max_ + min_) / 2.0) 75 | additional_attribute.append((max_ - min_) / 2.0) 76 | additional_attribute_outputs.append( 77 | Output( 78 | type_=OutputType.CONTINUOUS, 79 | dim=1, 80 | normalization=output.normalization, 81 | is_gen_flag=False, 82 | ) 83 | ) 84 | additional_attribute_outputs.append( 85 | Output( 86 | type_=OutputType.CONTINUOUS, 87 | dim=1, 88 | normalization=Normalization.ZERO_ONE, 89 | is_gen_flag=False, 90 | ) 91 | ) 92 | 93 | max_ = np.expand_dims(max_, axis=1) 94 | min_ = np.expand_dims(min_, axis=1) 95 | 96 | data_feature[:, :, dim] = (data_feature[:, :, dim] - min_) / ( 97 | max_ - min_ 98 | ) 99 | if output.normalization == Normalization.MINUSONE_ONE: 100 | data_feature[:, :, dim] = data_feature[:, 101 | :, dim] * 2.0 - 1.0 102 | 103 | dim += 1 104 | else: 105 | dim += output.dim 106 | 107 | real_attribute_mask = [True] * len(data_attribute_outputs) + 
[False] * len( 108 | additional_attribute_outputs 109 | ) 110 | 111 | additional_attribute = np.stack(additional_attribute, axis=1) 112 | data_attribute = np.concatenate( 113 | [data_attribute, additional_attribute], axis=1) 114 | data_attribute_outputs.extend(additional_attribute_outputs) 115 | 116 | return data_feature, data_attribute, data_attribute_outputs, real_attribute_mask 117 | 118 | 119 | def add_gen_flag(data_feature, data_gen_flag, data_feature_outputs, 120 | sample_len): 121 | for output in data_feature_outputs: 122 | if output.is_gen_flag: 123 | raise Exception("is_gen_flag should be False for all" 124 | "feature_outputs") 125 | 126 | if (data_feature.shape[2] != 127 | np.sum([t.dim for t in data_feature_outputs])): 128 | raise Exception("feature dimension does not match feature_outputs") 129 | 130 | if len(data_gen_flag.shape) != 2: 131 | raise Exception("data_gen_flag should be 2 dimension") 132 | 133 | num_sample, length = data_gen_flag.shape 134 | 135 | data_gen_flag = np.expand_dims(data_gen_flag, 2) 136 | 137 | data_feature_outputs.append(Output( 138 | type_=OutputType.DISCRETE, 139 | dim=2, 140 | is_gen_flag=True)) 141 | 142 | shift_gen_flag = np.concatenate( 143 | [data_gen_flag[:, 1:, :], 144 | np.zeros((data_gen_flag.shape[0], 1, 1))], 145 | axis=1) 146 | if length % sample_len != 0: 147 | raise Exception("length must be a multiple of sample_len") 148 | data_gen_flag_t = np.reshape( 149 | data_gen_flag, 150 | [num_sample, int(length / sample_len), sample_len]) 151 | data_gen_flag_t = np.sum(data_gen_flag_t, 2) 152 | data_gen_flag_t = data_gen_flag_t > 0.5 153 | data_gen_flag_t = np.repeat(data_gen_flag_t, sample_len, axis=1) 154 | data_gen_flag_t = np.expand_dims(data_gen_flag_t, 2) 155 | data_feature = np.concatenate( 156 | [data_feature, 157 | shift_gen_flag, 158 | (1 - shift_gen_flag) * data_gen_flag_t], 159 | axis=2) 160 | 161 | return data_feature, data_feature_outputs 162 | 163 | def reverse_gen_flag(gen_flags): 164 | gen_flags = np.concatenate((np.ones((gen_flags.shape[0], 1)), gen_flags[:, :-1]), axis=1) 165 | return gen_flags -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import inspect 5 | import numpy as np 6 | 7 | from .model import Model 8 | from netshare.utils import output 9 | # from gan import output # NOQA 10 | # sys.modules["output"] = output # NOQA 11 | from .doppelganger_torch.doppelganger import DoppelGANger # NOQA 12 | from .doppelganger_torch.util import add_gen_flag, normalize_per_sample, renormalize_per_sample, reverse_gen_flag # NOQA 13 | from .doppelganger_torch.load_data import load_data # NOQA 14 | 15 | 16 | class DoppelGANgerTorchModel(Model): 17 | def _train(self, input_train_data_folder, output_model_folder, log_folder): 18 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 19 | 20 | self._config["result_folder"] = getattr( 21 | self._config, "result_folder", output_model_folder) 22 | self._config["dataset"] = getattr( 23 | self._config, "dataset", input_train_data_folder) 24 | 25 | print("Currently training with config:", self._config) 26 | # save config to the result folder 27 | with open(os.path.join( 28 | self._config["result_folder"], 29 | "config.json"), 'w') as fout: 30 | json.dump(self._config, fout) 31 | 32 | # load data 33 | ( 34 | data_feature, 35 | data_attribute, 36 | data_gen_flag, 37 | 
data_feature_outputs, 38 | data_attribute_outputs, 39 | ) = load_data( 40 | path=self._config["dataset"], 41 | sample_len=self._config["sample_len"]) 42 | num_real_attribute = len(data_attribute_outputs) 43 | 44 | # self-norm if applicable 45 | if self._config["self_norm"]: 46 | ( 47 | data_feature, 48 | data_attribute, 49 | data_attribute_outputs, 50 | real_attribute_mask 51 | ) = normalize_per_sample( 52 | data_feature, 53 | data_attribute, 54 | data_feature_outputs, 55 | data_attribute_outputs) 56 | else: 57 | real_attribute_mask = [True] * num_real_attribute 58 | 59 | data_feature, data_feature_outputs = add_gen_flag( 60 | data_feature, data_gen_flag, data_feature_outputs, self._config["sample_len"] 61 | ) 62 | 63 | # create directories 64 | checkpoint_dir = os.path.join( 65 | self._config["result_folder"], 66 | "checkpoint") 67 | if not os.path.exists(checkpoint_dir): 68 | os.makedirs(checkpoint_dir) 69 | sample_dir = os.path.join(self._config["result_folder"], "sample") 70 | if not os.path.exists(sample_dir): 71 | os.makedirs(sample_dir) 72 | time_path = os.path.join(self._config["result_folder"], "time.txt") 73 | 74 | dg = DoppelGANger( 75 | checkpoint_dir=checkpoint_dir, 76 | sample_dir=None, 77 | time_path=time_path, 78 | batch_size=self._config["batch_size"], 79 | real_attribute_mask=real_attribute_mask, 80 | max_sequence_len=data_feature.shape[1], 81 | sample_len=self._config["sample_len"], 82 | data_feature_outputs=data_feature_outputs, 83 | data_attribute_outputs=data_attribute_outputs, 84 | vis_freq=self._config["vis_freq"], 85 | vis_num_sample=self._config["vis_num_sample"], 86 | d_rounds=self._config["d_rounds"], 87 | g_rounds=self._config["g_rounds"], 88 | d_gp_coe=self._config["d_gp_coe"], 89 | num_packing=self._config["num_packing"], 90 | use_attr_discriminator=self._config["use_attr_discriminator"], 91 | attr_d_gp_coe=self._config["attr_d_gp_coe"], 92 | g_attr_d_coe=self._config["g_attr_d_coe"], 93 | epoch_checkpoint_freq=self._config["epoch_checkpoint_freq"], 94 | attribute_latent_dim=self._config["attribute_latent_dim"], 95 | feature_latent_dim=self._config["feature_latent_dim"], 96 | g_lr=self._config["g_lr"], 97 | g_beta1=self._config["g_beta1"], 98 | d_lr=self._config["d_lr"], 99 | d_beta1=self._config["d_beta1"], 100 | attr_d_lr=self._config["attr_d_lr"], 101 | attr_d_beta1=self._config["attr_d_beta1"], 102 | adam_eps=self._config["adam_eps"], 103 | adam_amsgrad=self._config["adam_amsgrad"], 104 | generator_attribute_num_units=self._config["generator_attribute_num_units"], 105 | generator_attribute_num_layers=self._config["generator_attribute_num_layers"], 106 | generator_feature_num_units=self._config["generator_feature_num_units"], 107 | generator_feature_num_layers=self._config["generator_feature_num_layers"], 108 | use_adaptive_rolling=self._config["use_adaptive_rolling"], 109 | discriminator_num_layers=self._config["discriminator_num_layers"], 110 | discriminator_num_units=self._config["discriminator_num_units"], 111 | attr_discriminator_num_layers=self._config["attr_discriminator_num_layers"], 112 | attr_discriminator_num_units=self._config["attr_discriminator_num_units"], 113 | restore=getattr(self._config, "restore", False), 114 | pretrain_dir=self._config["pretrain_dir"] 115 | ) 116 | 117 | dg.train( 118 | epochs=self._config["epochs"], 119 | data_feature=data_feature, 120 | data_attribute=data_attribute, 121 | data_gen_flag=data_gen_flag, 122 | ) 123 | 124 | def _generate(self, input_train_data_folder, 125 | input_model_folder, output_syn_data_folder, 
log_folder): 126 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 127 | 128 | self._config["result_folder"] = getattr( 129 | self._config, "result_folder", input_model_folder) 130 | self._config["dataset"] = getattr( 131 | self._config, "dataset", input_train_data_folder) 132 | 133 | print("Currently generating with config:", self._config) 134 | 135 | # load data 136 | ( 137 | data_feature, 138 | data_attribute, 139 | data_gen_flag, 140 | data_feature_outputs, 141 | data_attribute_outputs, 142 | ) = load_data( 143 | path=self._config["dataset"], 144 | sample_len=self._config["sample_len"]) 145 | num_real_attribute = len(data_attribute_outputs) 146 | 147 | # self-norm if applicable 148 | if self._config["self_norm"]: 149 | ( 150 | data_feature, 151 | data_attribute, 152 | data_attribute_outputs, 153 | real_attribute_mask 154 | ) = normalize_per_sample( 155 | data_feature, 156 | data_attribute, 157 | data_feature_outputs, 158 | data_attribute_outputs) 159 | else: 160 | real_attribute_mask = [True] * num_real_attribute 161 | 162 | data_feature, data_feature_outputs = add_gen_flag( 163 | data_feature, data_gen_flag, data_feature_outputs, self._config["sample_len"] 164 | ) 165 | 166 | # create directories 167 | checkpoint_dir = os.path.join( 168 | self._config["result_folder"], 169 | "checkpoint") 170 | if not os.path.exists(checkpoint_dir): 171 | os.makedirs(checkpoint_dir) 172 | sample_dir = os.path.join(self._config["result_folder"], "sample") 173 | if not os.path.exists(sample_dir): 174 | os.makedirs(sample_dir) 175 | time_path = os.path.join(self._config["result_folder"], "time.txt") 176 | 177 | dg = DoppelGANger( 178 | checkpoint_dir=checkpoint_dir, 179 | sample_dir=None, 180 | time_path=time_path, 181 | batch_size=self._config["batch_size"], 182 | real_attribute_mask=real_attribute_mask, 183 | max_sequence_len=data_feature.shape[1], 184 | sample_len=self._config["sample_len"], 185 | data_feature_outputs=data_feature_outputs, 186 | data_attribute_outputs=data_attribute_outputs, 187 | vis_freq=self._config["vis_freq"], 188 | vis_num_sample=self._config["vis_num_sample"], 189 | d_rounds=self._config["d_rounds"], 190 | g_rounds=self._config["g_rounds"], 191 | d_gp_coe=self._config["d_gp_coe"], 192 | num_packing=self._config["num_packing"], 193 | use_attr_discriminator=self._config["use_attr_discriminator"], 194 | attr_d_gp_coe=self._config["attr_d_gp_coe"], 195 | g_attr_d_coe=self._config["g_attr_d_coe"], 196 | epoch_checkpoint_freq=self._config["epoch_checkpoint_freq"], 197 | attribute_latent_dim=self._config["attribute_latent_dim"], 198 | feature_latent_dim=self._config["feature_latent_dim"], 199 | g_lr=self._config["g_lr"], 200 | g_beta1=self._config["g_beta1"], 201 | d_lr=self._config["d_lr"], 202 | d_beta1=self._config["d_beta1"], 203 | attr_d_lr=self._config["attr_d_lr"], 204 | attr_d_beta1=self._config["attr_d_beta1"], 205 | adam_eps=self._config["adam_eps"], 206 | adam_amsgrad=self._config["adam_amsgrad"], 207 | generator_attribute_num_units=self._config["generator_attribute_num_units"], 208 | generator_attribute_num_layers=self._config["generator_attribute_num_layers"], 209 | generator_feature_num_units=self._config["generator_feature_num_units"], 210 | generator_feature_num_layers=self._config["generator_feature_num_layers"], 211 | use_adaptive_rolling=self._config["use_adaptive_rolling"], 212 | discriminator_num_layers=self._config["discriminator_num_layers"], 213 | discriminator_num_units=self._config["discriminator_num_units"], 214 | 
attr_discriminator_num_layers=self._config["attr_discriminator_num_layers"], 215 | attr_discriminator_num_units=self._config["attr_discriminator_num_units"], 216 | restore=getattr(self._config, "restore", False), 217 | pretrain_dir=self._config["pretrain_dir"] 218 | ) 219 | 220 | if self._config["given_data_attribute_flag"]: 221 | print("Generating from a given data attribute!") 222 | given_attr_npz_file = os.path.join( 223 | output_syn_data_folder, 224 | "attr_clean", 225 | "chunk_id-{}.npz".format(self._config["chunk_id"])) 226 | 227 | if not os.path.exists(given_attr_npz_file): 228 | raise ValueError( 229 | f"Given data attribute file {given_attr_npz_file}") 230 | given_data_attribute = np.load(given_attr_npz_file)[ 231 | "data_attribute"] 232 | given_data_attribute_discrete = np.load(given_attr_npz_file)[ 233 | "data_attribute_discrete"] 234 | # print("given_data_attribute:", given_data_attribute.shape) 235 | # print("given_data_attribute_discrete:", 236 | # given_data_attribute_discrete) 237 | else: 238 | print("Generating w/o given data attribute!") 239 | given_data_attribute = None 240 | given_data_attribute_discrete = None 241 | 242 | last_iteration_found = False 243 | epoch_range = list( 244 | range( 245 | self._config["epoch_checkpoint_freq"] - 1, 246 | self._config["epochs"], 247 | self._config["epoch_checkpoint_freq"], 248 | ) 249 | ) 250 | # reverse list in place 251 | epoch_range.reverse() 252 | generatedSamples_per_epoch = 1 253 | 254 | for epoch_id in epoch_range: 255 | if last_iteration_found and \ 256 | not self._config["given_data_attribute_flag"] and getattr(self._config, "n_chunks") > 1: 257 | break 258 | 259 | print("Processing epoch_id: {}".format(epoch_id)) 260 | mid_checkpoint_dir = os.path.join( 261 | checkpoint_dir, "epoch_id-{}.pt".format(epoch_id) 262 | ) 263 | if not os.path.exists(mid_checkpoint_dir): 264 | print("Not found {}".format(mid_checkpoint_dir)) 265 | continue 266 | else: 267 | last_iteration_found = True 268 | for generated_samples_idx in range(generatedSamples_per_epoch): 269 | print( 270 | "generate {}-th sample from epoch_id-{}".format( 271 | generated_samples_idx + 1, epoch_id 272 | ) 273 | ) 274 | 275 | num_samples = (data_attribute.shape[0] if ((given_data_attribute is None) and ( 276 | given_data_attribute_discrete is None)) else given_data_attribute.shape[0]) 277 | 278 | dg.load(mid_checkpoint_dir) 279 | print("Finished loading") 280 | 281 | ( 282 | features, 283 | attributes, 284 | attributes_discrete, 285 | gen_flags, 286 | lengths 287 | ) = dg.generate( 288 | num_samples=num_samples, 289 | given_attribute=given_data_attribute, 290 | given_attribute_discrete=given_data_attribute_discrete) 291 | 292 | gen_flags = reverse_gen_flag(gen_flags) 293 | 294 | if self._config["self_norm"]: 295 | features, attributes = renormalize_per_sample( 296 | features, 297 | attributes, 298 | data_feature_outputs, 299 | data_attribute_outputs, 300 | gen_flags, 301 | num_real_attribute=num_real_attribute, 302 | ) 303 | 304 | print(features.shape) 305 | print(attributes.shape) 306 | 307 | if getattr(self._config, "save_without_chunk", False) or getattr(self._config, "n_chunks") == 1: 308 | save_path = os.path.join( 309 | output_syn_data_folder, 310 | "feat_raw", 311 | "chunk_id-0") 312 | os.makedirs(save_path, exist_ok=True) 313 | np.savez( 314 | os.path.join( 315 | save_path, 316 | f"epoch_id-{epoch_id}.npz"), 317 | data_attribute=attributes, 318 | data_feature=features, 319 | data_gen_flag=gen_flags) 320 | elif not self._config["given_data_attribute_flag"]: 
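                    # Attribute-only pass (multi-chunk generation, phase 1):
                    # dump raw per-chunk attributes to attr_raw/; _merge_attr
                    # later merges them into attr_clean/, which the second
                    # pass (given_data_attribute_flag=True) reads to generate
                    # features conditioned on those attributes.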
321 | save_path = os.path.join( 322 | output_syn_data_folder, "attr_raw") 323 | os.makedirs(save_path, exist_ok=True) 324 | np.savez( 325 | os.path.join( 326 | save_path, 327 | "chunk_id-{}.npz".format( 328 | self._config["chunk_id"]) 329 | ), 330 | data_attribute=attributes, 331 | data_attribute_discrete=attributes_discrete 332 | ) 333 | print(os.path.join( 334 | save_path, 335 | "chunk_id-{}.npz".format( 336 | self._config["chunk_id"]) 337 | )) 338 | else: 339 | save_path = os.path.join( 340 | output_syn_data_folder, 341 | "feat_raw", 342 | f"chunk_id-{self._config['chunk_id']}") 343 | os.makedirs(save_path, exist_ok=True) 344 | np.savez( 345 | os.path.join( 346 | save_path, 347 | f"epoch_id-{epoch_id}.npz" 348 | ), 349 | data_attribute=attributes, 350 | data_feature=features, 351 | data_gen_flag=gen_flags, 352 | config=self._config 353 | ) 354 | -------------------------------------------------------------------------------- /netshare/models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | 4 | from netshare.utils import Tee 5 | 6 | 7 | class Model(ABC): 8 | def __init__(self, config): 9 | self._config = config 10 | 11 | @abstractmethod 12 | def _train(self, input_train_data_folder, output_model_folder, log_folder): 13 | ... 14 | 15 | @abstractmethod 16 | def _generate(self, input_train_data_folder, 17 | input_model_folder, output_syn_data_folder, log_folder): 18 | ... 19 | 20 | def train(self, input_train_data_folder, output_model_folder, log_folder): 21 | stdout_log_path = os.path.join(log_folder, 'model.train.stdout.log') 22 | stderr_log_path = os.path.join(log_folder, 'model.train.stderr.log') 23 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 24 | return self._train( 25 | input_train_data_folder=input_train_data_folder, 26 | output_model_folder=output_model_folder, 27 | log_folder=log_folder) 28 | 29 | def generate(self, input_train_data_folder, input_model_folder, 30 | output_syn_data_folder, log_folder): 31 | stdout_log_path = os.path.join(log_folder, 'model.generate.stdout.log') 32 | stderr_log_path = os.path.join(log_folder, 'model.generate.stderr.log') 33 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 34 | return self._generate( 35 | input_train_data_folder=input_train_data_folder, 36 | input_model_folder=input_model_folder, 37 | output_syn_data_folder=output_syn_data_folder, 38 | log_folder=log_folder) 39 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .pre_post_processor import PrePostProcessor 2 | from .netshare.netshare_pre_post_processor import NetsharePrePostProcessor 3 | from .dg_row_per_sample_pre_post_processor import DGRowPerSamplePrePostProcessor 4 | 5 | __all__ = [ 6 | 'PrePostProcessor', 7 | 'NetsharePrePostProcessor', 8 | 'DGRowPerSamplePrePostProcessor'] 9 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/dg_row_per_sample_pre_post_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | import os 5 | import csv 6 | from tqdm import tqdm 7 | 8 | from .pre_post_processor import PrePostProcessor 9 | from netshare.utils.field import ContinuousField, DiscreteField 10 | from 
netshare.utils.output import Normalization 11 | 12 | EPS = 1e-8 13 | 14 | 15 | class DGRowPerSamplePrePostProcessor(PrePostProcessor): 16 | def _pre_process(self, input_folder, output_folder, log_folder): 17 | # input is a file path 18 | file_path = input_folder 19 | 20 | original_df = pd.read_csv(file_path) 21 | 22 | # Remove missing rows. 23 | original_df.dropna(inplace=True) 24 | 25 | # Parse data. 26 | metadata_numpys = [] 27 | metadata_fields = [] 28 | for i, field in enumerate(self._config.metadata): 29 | if not isinstance(field.column, str): 30 | raise ValueError('"column" should be a string') 31 | this_df = original_df[field.column].astype(str) 32 | if 'regex' in field: 33 | this_df = this_df.str.extract(field.regex, expand=False) 34 | if field.type == 'string': 35 | choices = list(pd.unique(this_df)) 36 | field_instance = DiscreteField( 37 | choices=choices, 38 | name=getattr(field, 'name', field.column)) 39 | this_numpy = field_instance.normalize(this_df.to_numpy()) 40 | elif field.type == 'float': 41 | this_df = this_df.astype(np.float64) 42 | this_numpy = this_df.to_numpy() 43 | this_numpy = this_numpy.reshape((this_df.shape[0], 1)) 44 | field_instance = ContinuousField( 45 | norm_option=getattr(Normalization, field.normalization), 46 | min_x=this_numpy.min() - EPS, 47 | max_x=this_numpy.max() + EPS, 48 | dim_x=1, 49 | name=getattr(field, 'name', field.column)) 50 | this_numpy = field_instance.normalize(this_numpy) 51 | else: 52 | raise ValueError(f'Unknown field type {field.type}') 53 | metadata_numpys.append(this_numpy) 54 | metadata_fields.append(field_instance) 55 | metadata_numpy = np.concatenate( 56 | metadata_numpys, axis=1).astype(np.float64) 57 | print(f'List of metadata: ' 58 | f'{list((k.dtype, k.shape) for k in metadata_numpys)}') 59 | print(f'Metadata type: {metadata_numpy.dtype}, ' 60 | f'shape: {metadata_numpy.shape}') 61 | 62 | timeseries_numpys = [] 63 | timeseries_fields = [] 64 | for i, field in enumerate(self._config.timeseries): 65 | if not isinstance(field.columns, list): 66 | raise ValueError('"columns" should be a list') 67 | this_df = original_df[field.columns].astype(str) 68 | if 'regex' in field: 69 | for column in field.columns: 70 | this_df[column] = this_df[column].str.extract( 71 | field.regex, expand=False) 72 | if field.type == 'string': 73 | choices = list(pd.unique(this_df.values.ravel('K'))) 74 | field_instance = DiscreteField( 75 | choices=choices, 76 | name=getattr(field, 'name', field.columns)) 77 | this_numpy = field_instance.normalize(this_df.to_numpy()) 78 | this_numpy = this_numpy.reshape( 79 | (this_df.shape[0], len(field.columns), len(choices))) 80 | elif field.type == 'float': 81 | this_df = this_df.astype(np.float64) 82 | this_numpy = this_df.to_numpy() 83 | this_numpy = this_numpy.reshape( 84 | (this_df.shape[0], len(field.columns), 1)) 85 | if getattr(field, 'log1p_norm', False): 86 | this_numpy = np.log1p(this_numpy) 87 | field_instance = ContinuousField( 88 | norm_option=getattr(Normalization, field.normalization), 89 | min_x=this_numpy.min() - EPS, 90 | max_x=this_numpy.max() + EPS, 91 | dim_x=1, 92 | name=getattr(field, 'name', field.columns)) 93 | this_numpy = field_instance.normalize(this_numpy) 94 | else: 95 | raise ValueError(f'Unknown field type {field.type}') 96 | timeseries_numpys.append(this_numpy) 97 | timeseries_fields.append(field_instance) 98 | timeseries_numpy = np.concatenate(timeseries_numpys, axis=2).astype( 99 | np.float64) 100 | print(f'List of timeseries: ' 101 | f'{list((k.dtype, k.shape) for k in 
timeseries_numpys)}') 102 | print(f'Timeseries type: {timeseries_numpy.dtype}, ' 103 | f'shape: {timeseries_numpy.shape}') 104 | 105 | # Randomly select the required number of samples. 106 | np.random.seed(getattr(self._config, 'random_seed', 0)) 107 | ids = np.random.permutation(metadata_numpy.shape[0]) 108 | metadata_train_numpy = metadata_numpy[ 109 | ids[:self._config.num_train_samples]] 110 | timeseries_train_numpy = timeseries_numpy[ 111 | ids[:self._config.num_train_samples]] 112 | 113 | print(f'Metadata train type: {metadata_train_numpy.dtype}, ' 114 | f'shape: {metadata_train_numpy.shape}') 115 | print(f'Timeseries train type: {timeseries_train_numpy.dtype}, ' 116 | f'shape: {timeseries_train_numpy.shape}') 117 | 118 | # Write files 119 | with open(os.path.join( 120 | output_folder, 'data_attribute_output.pkl'), 'wb') as f: 121 | pickle.dump([v.getOutputType() for v in metadata_fields], f) 122 | with open(os.path.join( 123 | output_folder, 'data_feature_output.pkl'), 'wb') as f: 124 | pickle.dump([v.getOutputType() for v in timeseries_fields], f) 125 | with open(os.path.join( 126 | output_folder, 'data_attribute_fields.pkl'), 'wb') as f: 127 | pickle.dump(metadata_fields, f) 128 | with open(os.path.join( 129 | output_folder, 'data_feature_fields.pkl'), 'wb') as f: 130 | pickle.dump(timeseries_fields, f) 131 | npz_folder = os.path.join(output_folder, 'data_train_npz') 132 | os.makedirs(npz_folder) 133 | for i in range(metadata_train_numpy.shape[0]): 134 | np.savez( 135 | os.path.join(npz_folder, f'data_train_{i}.npz'), 136 | data_feature=timeseries_train_numpy[i], 137 | data_attribute=metadata_train_numpy[i], 138 | data_gen_flag=np.ones(timeseries_train_numpy.shape[1]), 139 | global_max_flow_len=[timeseries_train_numpy.shape[1]]) 140 | 141 | return True 142 | 143 | def _post_process(self, input_folder, output_folder, 144 | pre_processed_data_folder, log_folder): 145 | with open(os.path.join( 146 | pre_processed_data_folder, 147 | 'data_attribute_fields.pkl'), 'rb') as f: 148 | metadata_fields = pickle.load(f) 149 | with open(os.path.join( 150 | pre_processed_data_folder, 151 | 'data_feature_fields.pkl'), 'rb') as f: 152 | timeseries_fields = pickle.load(f) 153 | sub_folders = os.listdir(input_folder) 154 | for sub_folder in sub_folders: 155 | data_path = os.path.join(input_folder, sub_folder, 'data.npz') 156 | data = np.load(data_path) 157 | unnormalized_timeseries = data['data_feature'] 158 | unnormalized_metadata = data['data_attribute'] 159 | data_gen_flag = data['data_gen_flag'] 160 | timeseries = [] 161 | metadata = [] 162 | dim = 0 163 | for field_i, field in enumerate(metadata_fields): 164 | sub_metadata = field.denormalize( 165 | unnormalized_metadata[ 166 | :, dim: dim + field.getOutputType().dim]) 167 | if getattr(self._config.metadata[field_i], 'log1p_norm', 168 | False): 169 | sub_metadata = np.exp(sub_metadata) - 1 170 | if isinstance(field, ContinuousField): 171 | sub_metadata = sub_metadata[:, 0] 172 | metadata.append(sub_metadata) 173 | dim += field.getOutputType().dim 174 | assert dim == unnormalized_metadata.shape[1] 175 | 176 | timeseries = [] 177 | dim = 0 178 | for field_i, field in enumerate(timeseries_fields): 179 | sub_timeseries = field.denormalize( 180 | unnormalized_timeseries[ 181 | :, :, dim: dim + field.getOutputType().dim]) 182 | if getattr(self._config.timeseries[field_i], 'log1p_norm', 183 | False): 184 | sub_timeseries = np.exp(sub_timeseries) - 1 185 | if isinstance(field, ContinuousField): 186 | sub_timeseries = sub_timeseries[:, :, 0] 187 | 
timeseries.append(sub_timeseries) 188 | dim += field.getOutputType().dim 189 | assert dim == unnormalized_timeseries.shape[2] 190 | 191 | csv_folder = os.path.join(output_folder, sub_folder) 192 | os.makedirs(csv_folder) 193 | csv_path = os.path.join(csv_folder, 'data.csv') 194 | with open(csv_path, 'w') as f: 195 | writer = csv.writer(f) 196 | writer.writerow( 197 | [field.name for field in metadata_fields] + 198 | [column_name for field in timeseries_fields 199 | for column_name in field.name]) 200 | for i in tqdm(range(unnormalized_timeseries.shape[0])): 201 | writer.writerow( 202 | [d[i] for d in metadata] + 203 | [sd 204 | for d in timeseries 205 | for sd in d[i][:int(np.sum(data_gen_flag[i]))]]) 206 | return True 207 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/README.md: -------------------------------------------------------------------------------- 1 | In case of any change to the `main.c` and `packet.h` file, please use `sharedlib.sh` to create new shared library. -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/choose_best_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from scipy.stats import rankdata 8 | from .util import create_sdmetrics_config, convert_sdmetricsConfigQuant_to_fieldValueDict 9 | from sdmetrics.reports.timeseries import QualityReport 10 | 11 | 12 | def compare_rawdf_syndfs( 13 | raw_df, 14 | syn_dfs, 15 | config_pre_post_processor 16 | ): 17 | # Compare raw_df and syn_dfs and return the best syn_df 18 | sdmetrics_config = create_sdmetrics_config( 19 | config_pre_post_processor, 20 | comparison_type='quantitative') 21 | report = QualityReport(config_dict=sdmetrics_config['config']) 22 | 23 | metrics_dict_list = [] 24 | for syn_df in syn_dfs: 25 | report.generate( 26 | raw_df, syn_df, sdmetrics_config['metadata']) 27 | metrics_dict_list.append( 28 | convert_sdmetricsConfigQuant_to_fieldValueDict( 29 | report.dict_metric_scores)) 30 | 31 | metrics = list(metrics_dict_list[0].keys()) 32 | metric_vals_dict = {} 33 | for metrics_dict in metrics_dict_list: 34 | for metric in metrics: 35 | if metric not in metric_vals_dict: 36 | metric_vals_dict[metric] = [] 37 | metric_vals_dict[metric].append(metrics_dict[metric]) 38 | metric_vals_2d = [] 39 | for metric, vals in metric_vals_dict.items(): 40 | metric_vals_2d.append(vals) 41 | rankings_sum = np.sum(rankdata(metric_vals_2d, axis=1), axis=0) 42 | best_syndf_idx = np.argmin(rankdata(rankings_sum)) 43 | 44 | return best_syndf_idx, syn_dfs[best_syndf_idx] 45 | 46 | 47 | def choose_best_model( 48 | config_pre_post_processor, 49 | pre_processed_data_folder, 50 | generated_data_folder, 51 | post_processed_data_folder 52 | ): 53 | with open(os.path.join(generated_data_folder, "configs_generate.json"), 'r') as f: 54 | data = json.load(f) 55 | configs = data["configs"] 56 | # add pre_processor configs in place 57 | for config in configs: 58 | config.update(config_pre_post_processor) 59 | config_group_list = data["config_group_list"] 60 | 61 | # TODO: change to distribute (Ray-style) 62 | dict_dataset_syndfs = {} 63 | for config_group_idx, config_group in enumerate(config_group_list): 64 | print("Config group #{}: {}".format(config_group_idx, config_group)) 65 | config_ids = config_group["config_ids"] 66 | chunk0_idx = config_ids[0] 67 | 
syndf_root_folder = os.path.join( 68 | configs[chunk0_idx]["eval_root_folder"], "syn_dfs" 69 | ) 70 | assert len( 71 | [ 72 | file 73 | for file in os.listdir(syndf_root_folder) 74 | if file.startswith("chunk_id") 75 | ] 76 | ) == len(config_ids) 77 | 78 | best_syndfs = [] 79 | truncate_ratios = [] 80 | for chunk_id, config_idx in enumerate(config_ids): 81 | config = configs[config_idx] 82 | raw_df = pd.read_csv(os.path.join(config["dataset"], "raw.csv")) 83 | time_col_name = getattr( 84 | getattr(config_pre_post_processor, 'timestamp'), 85 | 'column') 86 | 87 | syn_dfs = [] 88 | syn_dfs_names = [] 89 | syn_df_folder = os.path.join( 90 | syndf_root_folder, "chunk_id-{}".format(chunk_id) 91 | ) 92 | for file in os.listdir(syn_df_folder): 93 | if file.endswith(".csv"): 94 | syn_dfs_names.append(file) 95 | syn_df = pd.read_csv(os.path.join(syn_df_folder, file)) 96 | 97 | # truncate to raw data time range 98 | if config["truncate"] == "per_chunk": 99 | syn_df_truncated = syn_df[ 100 | (syn_df[time_col_name] >= raw_df[time_col_name].min()) 101 | & (syn_df[time_col_name] <= raw_df[time_col_name].max()) 102 | ] 103 | # TODO: support more truncation methods if necessary 104 | else: 105 | raise ValueError("Unknown truncation methods...") 106 | truncate_ratios.append( 107 | 1.0 - len(syn_df_truncated) / len(syn_df)) 108 | 109 | syn_dfs.append(syn_df_truncated) 110 | 111 | best_syndf_idx, best_syndf = compare_rawdf_syndfs( 112 | raw_df[syn_dfs[0].columns], syn_dfs, config_pre_post_processor 113 | ) 114 | 115 | best_syndfs.append(best_syndf) 116 | print( 117 | "Chunk_id: {}, # of syn dfs: {}, best_syndf: {}".format( 118 | chunk_id, len(syn_dfs), syn_dfs_names[best_syndf_idx] 119 | ) 120 | ) 121 | 122 | print("Average truncation ratio:", np.mean(truncate_ratios)) 123 | big_best_syndf = pd.concat(best_syndfs) 124 | print("Big syndf shape:", big_best_syndf.shape) 125 | print() 126 | 127 | if config_group["dp_noise_multiplier"] not in dict_dataset_syndfs: 128 | dict_dataset_syndfs[config_group["dp_noise_multiplier"]] = [] 129 | dict_dataset_syndfs[config_group["dp_noise_multiplier"]].append( 130 | big_best_syndf) 131 | 132 | dict_dataset_bestsyndf = {} 133 | 134 | big_raw_df = pd.read_csv(os.path.join(pre_processed_data_folder, "raw.csv")) 135 | for dpnoisemultiplier, syn_dfs in dict_dataset_syndfs.items(): 136 | assert len(syn_dfs) >= 1 137 | if len(syn_dfs) > 1: 138 | best_syndf_idx, best_syn_df = compare_rawdf_syndfs( 139 | big_raw_df[syn_dfs[0].columns], 140 | syn_dfs, config_pre_post_processor) 141 | dict_dataset_bestsyndf[dpnoisemultiplier] = best_syn_df 142 | else: 143 | dict_dataset_bestsyndf[dpnoisemultiplier] = syn_dfs[0] 144 | 145 | print("Aggregated final dataset syndf") 146 | for dp_noise_multiplier, best_syndf in dict_dataset_bestsyndf.items(): 147 | print(dp_noise_multiplier, best_syndf.shape) 148 | best_syndf_folder = post_processed_data_folder 149 | os.makedirs(best_syndf_folder, exist_ok=True) 150 | 151 | # find best syndf index i.e., for evaluation fairness 152 | cur_max_idx = None 153 | for file in os.listdir(best_syndf_folder): 154 | if file.startswith( 155 | "syn_df,dp_noise_multiplier-{},truncate-{},id-".format( 156 | dp_noise_multiplier, config["truncate"] 157 | ) 158 | ): 159 | this_id = int(os.path.splitext(file)[ 160 | 0].split(",")[-1].split("-")[1]) 161 | if cur_max_idx is None or this_id > cur_max_idx: 162 | cur_max_idx = this_id 163 | if cur_max_idx is None: 164 | cur_max_idx = 1 165 | else: 166 | cur_max_idx += 1 167 | 168 | best_syndf_filename = os.path.join( 169 | 
best_syndf_folder, 170 | "syn_df,dp_noise_multiplier-{},truncate-{},id-{}.csv".format( 171 | dp_noise_multiplier, config["truncate"], cur_max_idx) 172 | ) 173 | # best_syndf_filename = os.path.join(best_syndf_folder, "syn.csv") 174 | 175 | print("best_syn_df filename:", best_syndf_filename) 176 | 177 | # sort by timestamp if applicable 178 | if config_pre_post_processor.timestamp.generation: 179 | time_col_name = config_pre_post_processor.timestamp.column 180 | best_syndf = best_syndf.sort_values(time_col_name) 181 | best_syndf.to_csv(best_syndf_filename, index=False) 182 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/denormalize_fields.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import json 4 | import random 5 | import pickle 6 | from typing import Dict, List 7 | 8 | import numpy as np 9 | from config_io import Config 10 | from tqdm import tqdm 11 | 12 | from netshare.utils.logger import logger 13 | 14 | 15 | def _get_fields_names(fields_list): 16 | """ 17 | This function returns the names of the given fields. 18 | """ 19 | field_names = [] 20 | for field in fields_list: 21 | if isinstance(field.name, list): 22 | field_names.extend(field.name) 23 | else: 24 | field_names.append(field.name) 25 | return field_names 26 | 27 | 28 | def _denormalize_by_fields_list( 29 | normalized_data, 30 | fields_list, 31 | is_session_key 32 | ): 33 | """ 34 | This function executes field.denormalize for each of the given field. 35 | """ 36 | denormalized_data = [] 37 | dim = 0 38 | 39 | for field in fields_list: 40 | if is_session_key: 41 | sub_data = normalized_data[:, dim: dim + field.dim_x] 42 | else: 43 | sub_data = normalized_data[:, :, dim: dim + field.dim_x] 44 | 45 | sub_data = field.denormalize(sub_data) 46 | 47 | # For session key, if shape looks like (n, ), change it to (n, 1) for consistency 48 | if is_session_key and len(sub_data.shape) == 1: 49 | sub_data = np.expand_dims(sub_data, axis=1) 50 | # For timeseries, if shape looks like (i, j), change it to (i, j, 1) for consistency 51 | if not is_session_key and len(sub_data.shape) == 2: 52 | sub_data = np.expand_dims(sub_data, axis=2) 53 | denormalized_data.append(sub_data) 54 | dim += field.dim_x 55 | return denormalized_data 56 | 57 | 58 | def write_to_csv( 59 | csv_folder, 60 | session_key_fields, 61 | timeseries_fields, 62 | session_key, 63 | timeseries, 64 | data_gen_flag, 65 | filename, 66 | config, 67 | ) -> None: 68 | """ 69 | This function dumps the given data to the given directory as a csv format. 70 | `data_gen_flag` is an indicator showing if the time series for this session 71 | has ended in this time step. 
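    Rows whose session key duplicates one that has already been written are
    skipped.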
72 | """ 73 | os.makedirs(csv_folder, exist_ok=True) 74 | csv_path = os.path.join(csv_folder, filename) 75 | # change session key shape to #session * #attributes 76 | session_key_numpy = np.array(np.concatenate(session_key, axis=1)) 77 | # change timeseries shape to #session * #time_steps * #features 78 | timeseries_numpy = np.array(np.concatenate(timeseries, axis=2)) 79 | 80 | with open(csv_path, "w") as f: 81 | writer = csv.writer(f) 82 | raw_metadata_field_names = [ 83 | col.column for col in (config["metadata"]) 84 | ] 85 | raw_timeseries_filed_names = [ 86 | col.column for col in config["timeseries"]] 87 | session_titles = [ 88 | f for i, f in enumerate(_get_fields_names(session_key_fields)) 89 | if f in raw_metadata_field_names] 90 | session_titles_idx = [ 91 | i for i, f in enumerate(_get_fields_names(session_key_fields)) 92 | if f in raw_metadata_field_names] 93 | timeseries_titles = [ 94 | f 95 | for i, f in enumerate(_get_fields_names(timeseries_fields)) 96 | if f in raw_timeseries_filed_names 97 | ] 98 | timeseries_titles_idx = [ 99 | i 100 | for i, f in enumerate(_get_fields_names(timeseries_fields)) 101 | if f in raw_timeseries_filed_names 102 | ] 103 | 104 | if config["timestamp"].get("generation", False): 105 | timeseries_titles.append(config["timestamp"]["column"]) 106 | if config["timestamp"]["encoding"] == "interarrival": 107 | # Find `flow_start` and `interarrival_within_flow` index 108 | flow_start_idx, interarrival_within_flow_idx = None, None 109 | for idx, field_name in enumerate( 110 | _get_fields_names(session_key_fields)): 111 | if field_name == "flow_start": 112 | flow_start_idx = idx 113 | break 114 | for idx, field_name in enumerate( 115 | _get_fields_names(timeseries_fields)): 116 | if field_name == "interarrival_within_flow": 117 | interarrival_within_flow_idx = idx 118 | break 119 | if flow_start_idx is None or interarrival_within_flow_idx is None: 120 | raise ValueError( 121 | "Using `interarrival` encoding: `flow_start` or `interarrival_field` not found!" 
122 | ) 123 | 124 | # convert interarrival to raw timestamp 125 | interarrival_cumsum = np.cumsum( 126 | timeseries_numpy[:, :, interarrival_within_flow_idx].astype( 127 | float), 128 | axis=1) 129 | # first packet has 0.0 interarrival 130 | interarrival_cumsum[:, 0] = 0.0 131 | flow_start_expand = ( 132 | np.array( 133 | [ 134 | session_key_numpy[:, flow_start_idx], 135 | ] 136 | * interarrival_cumsum.shape[1] 137 | ) 138 | .transpose() 139 | .astype(float) 140 | ) 141 | timestamp_matrix = np.expand_dims( 142 | np.add(flow_start_expand, interarrival_cumsum), axis=2 143 | ) 144 | timeseries_numpy = np.concatenate( 145 | (timeseries_numpy, timestamp_matrix), axis=2 146 | ) 147 | timeseries_titles_idx.append(timeseries_numpy.shape[2] - 1) 148 | 149 | writer.writerow(session_titles + timeseries_titles) 150 | 151 | session_key_set = set() 152 | for ( 153 | data_gen_per_session, 154 | session_data_per_session, 155 | timeseries_per_session, 156 | ) in zip( 157 | data_gen_flag, 158 | # remove cols not in raw data 159 | session_key_numpy[:, session_titles_idx], 160 | timeseries_numpy[ 161 | :, :, timeseries_titles_idx 162 | ], # remove cols not in raw data 163 | ): 164 | session_data_per_session = session_data_per_session.tolist() 165 | # remove duplicated session keys 166 | if tuple(session_data_per_session) in session_key_set: 167 | logger.debug( 168 | "Session key {session_data_per_session} already exists!") 169 | continue 170 | session_key_set.add(tuple(session_data_per_session)) 171 | for j in range(data_gen_per_session.shape[0]): 172 | if data_gen_per_session[j] == 1.0: 173 | timeseries_data = timeseries_per_session[j].tolist() 174 | writer.writerow(session_data_per_session + timeseries_data) 175 | 176 | 177 | def denormalize_fields( 178 | config_pre_post_processor, 179 | pre_processed_data_folder, 180 | generated_data_folder, 181 | post_processed_data_folder 182 | ): 183 | """ 184 | This function denormalizes the data in the generated_data folder using the attributes and features fields that were created in the pre-process step. 185 | Last, it writes the denormalized data to a csv file under the same directory hierarchy as the created data. 186 | 187 | :return: the path to the denormalized data. 
188 | """ 189 | with open(os.path.join(generated_data_folder, "configs_generate.json"), 'r') as f: 190 | data = json.load(f) 191 | configs = data["configs"] 192 | config_group_list = data["config_group_list"] 193 | 194 | for config in tqdm(configs): 195 | with open(os.path.join( 196 | pre_processed_data_folder, 197 | f"chunkid-{config['chunk_id']}", 198 | "data_attribute_fields.pkl" 199 | ), 'rb') as f: 200 | session_key_fields = list(pickle.load(f)) 201 | 202 | with open(os.path.join( 203 | pre_processed_data_folder, 204 | f"chunkid-{config['chunk_id']}", 205 | "data_feature_fields.pkl" 206 | ), 'rb') as f: 207 | timeseries_fields = list(pickle.load(f)) 208 | 209 | # Each configuration has multiple iteration ckpts 210 | per_chunk_basedir = os.path.join( 211 | config["eval_root_folder"], 212 | "feat_raw", f"chunk_id-{config['chunk_id']}") 213 | for f in os.listdir(per_chunk_basedir): 214 | if not f.endswith(".npz"): 215 | continue 216 | data = np.load(os.path.join(per_chunk_basedir, f)) 217 | unnormalized_session_key = data["data_attribute"] 218 | unnormalized_timeseries = data["data_feature"] 219 | data_gen_flag = data["data_gen_flag"] 220 | 221 | session_key = _denormalize_by_fields_list( 222 | unnormalized_session_key, session_key_fields, 223 | is_session_key=True) 224 | timeseries = _denormalize_by_fields_list( 225 | unnormalized_timeseries, timeseries_fields, is_session_key=False 226 | ) 227 | 228 | csv_root_folder = os.path.join( 229 | config["eval_root_folder"], "syn_dfs") 230 | csv_filename = f.replace(".npz", ".csv") 231 | write_to_csv( 232 | csv_folder=os.path.join( 233 | csv_root_folder, f"chunk_id-{config['chunk_id']}" 234 | ), 235 | session_key_fields=session_key_fields, 236 | timeseries_fields=timeseries_fields, 237 | session_key=session_key, 238 | timeseries=timeseries, 239 | data_gen_flag=data_gen_flag, 240 | filename=csv_filename, 241 | config=config_pre_post_processor, 242 | ) 243 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/dist_metrics.py: -------------------------------------------------------------------------------- 1 | from .embedding_helper import build_annoy_dictionary_word2vec, get_original_obj 2 | from netshare.utils import ContinuousField, DiscreteField, BitField 3 | from netshare.utils import Normalization 4 | from netshare.utils import Tee, Output 5 | from scipy.spatial import distance 6 | from scipy.stats import wasserstein_distance 7 | from collections import Counter, OrderedDict 8 | from gensim.models import Word2Vec 9 | from tqdm import tqdm 10 | import statsmodels.api as sm 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import matplotlib 14 | import numpy as np 15 | import sys 16 | import configparser 17 | import json 18 | import random 19 | import copy 20 | import math 21 | import os 22 | import pickle 23 | random.seed(42) 24 | 25 | 26 | # avoid type3 fonts 27 | matplotlib.rcParams['pdf.fonttype'] = 42 28 | matplotlib.rcParams['ps.fonttype'] = 42 29 | matplotlib.rcParams.update({'font.size': 15}) 30 | 31 | # color-blindness friendly 32 | CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a', 33 | '#f781bf', '#a65628', '#984ea3', 34 | '#999999', '#e41a1c', '#dede00'] 35 | # colors = { 36 | # 'blue': [55, 126, 184], #377eb8 37 | # 'orange': [255, 127, 0], #ff7f00 38 | # 'green': [77, 175, 74], #4daf4a 39 | # 'pink': [247, 129, 191], #f781bf 40 | # 'brown': [166, 86, 40], #a65628 41 | # 'purple': [152, 78, 163], #984ea3 42 | # 'gray': [153, 153, 153], #999999 43 | 
# 'red': [228, 26, 28], #e41a1c 44 | # 'yellow': [222, 222, 0] #dede00 45 | # } 46 | 47 | # https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml 48 | dict_pr_str2int = { 49 | "ESP": 50, 50 | "GRE": 47, 51 | "ICMP": 1, 52 | "IPIP": 4, 53 | "IPv6": 41, 54 | "TCP": 6, 55 | "UDP": 17, 56 | "RSVP": 46, 57 | "Other": 255, 58 | "255": 255, # TEMP 59 | } 60 | 61 | 62 | # jsd 63 | def jsd(p, q, type): 64 | p = list(p) 65 | q = list(q) 66 | 67 | if type == "discrete": 68 | # append 0 to shorter arrays: only for IP 69 | pq_max_len = max(len(p), len(q)) 70 | p += [0.0] * (pq_max_len - len(p)) 71 | q += [0.0] * (pq_max_len - len(q)) 72 | assert (len(p) == len(q)) 73 | return distance.jensenshannon(p, q)**2 74 | 75 | elif type == "continuous": 76 | # min_ = min(min(p), min(q)) 77 | # max_ = max(max(p), max(q)) 78 | 79 | min_ = min(p) 80 | max_ = max(p) 81 | 82 | # assume p is raw data 83 | # compute n_bins by FD on raw data; use across baselines 84 | p_counts, p_bin_edges = np.histogram( 85 | p, range=(min_, max_), bins="auto") 86 | q_counts, q_bin_edges = np.histogram( 87 | q, range=(min_, max_), bins=len(p_counts)) 88 | 89 | # out of range 90 | q_arr = np.array(q) 91 | q_arr_lt_realmin = q_arr[q_arr < min_] 92 | q_arr_gt_realmax = q_arr[q_arr > max_] 93 | 94 | if len(q_arr_lt_realmin) > 0: 95 | np.insert(q_counts, 0, len(q_arr_lt_realmin)) 96 | np.insert(p_counts, 0, 0.0) 97 | if len(q_arr_gt_realmax) > 0: 98 | np.append(q_counts, len(q_arr_gt_realmax)) 99 | np.append(p_counts, 0.0) 100 | 101 | return distance.jensenshannon(p_counts, q_counts)**2 102 | 103 | else: 104 | raise ValueError("Unknown JSD data type") 105 | 106 | 107 | def compute_IP_rank_distance(real_list, syn_list, type="EMD"): 108 | real_HH_count = OrderedDict(Counter(real_list).most_common()) 109 | syn_HH_count = OrderedDict(Counter(syn_list).most_common()) 110 | 111 | real_rank_list = [] 112 | idx = 1 113 | for k, v in real_HH_count.items(): 114 | real_rank_list += [idx] * v 115 | idx += 1 116 | 117 | syn_rank_list = [] 118 | idx = 1 119 | for k, v in syn_HH_count.items(): 120 | syn_rank_list += [idx] * v 121 | idx += 1 122 | 123 | if type == "EMD": 124 | return wasserstein_distance(real_rank_list, syn_rank_list) 125 | elif type == "JSD": 126 | return jsd(real_HH_count.values(), 127 | syn_HH_count.values(), type="discrete") 128 | else: 129 | raise ValueError("Unknown distance metric!") 130 | 131 | # type == "freq": return the freq dict 132 | 133 | 134 | def compute_port_proto_distance( 135 | real_list, syn_list, opt, prstr_raw=True, prstr_syn=True, type="TV"): 136 | real_list = list(real_list) 137 | syn_list = list(syn_list) 138 | 139 | # TCP: 6 140 | # UDP: 17 141 | # Other: 255, used for binning other protocols 142 | if opt == "proto": 143 | # convert to integer if protocol is string (e.g., "TCP"/"UDP") 144 | if isinstance(real_list[0], str): 145 | real_list_numeric = [] 146 | for i in real_list: 147 | i = i.strip() 148 | real_list_numeric.append(dict_pr_str2int[i.upper()]) 149 | real_list = real_list_numeric 150 | 151 | if isinstance(syn_list[0], str): 152 | syn_list_numeric = [] 153 | for i in syn_list: 154 | i = i.strip() 155 | syn_list_numeric.append(dict_pr_str2int[i.upper()]) 156 | syn_list = syn_list_numeric 157 | 158 | if opt == "srcport" or opt == "dstport": 159 | real_dict = {} 160 | syn_dict = {} 161 | for i in range(65536): 162 | real_dict[i] = 0 163 | syn_dict[i] = 0 164 | for i in real_list: 165 | real_dict[int(i)] += float(1 / len(real_list)) 166 | for i in syn_list: 167 | if i < 0: 168 | i = 0 169 
| elif i > 65535: 170 | i = 65535 171 | syn_dict[int(i)] += float(1 / len(syn_list)) 172 | 173 | if type == "TV": 174 | tv_distance = 0 175 | for i in range(65536): 176 | tv_distance += 0.5 * abs(real_dict[i] - syn_dict[i]) 177 | return tv_distance 178 | elif type == "JSD": 179 | return jsd(real_dict.values(), syn_dict.values(), type="discrete") 180 | elif type == "freq": 181 | return real_dict, syn_dict 182 | else: 183 | raise ValueError("Unknown distance metric!") 184 | 185 | elif opt == "proto": 186 | real_dict = {} 187 | syn_dict = {} 188 | for i in range(256): 189 | real_dict[i] = 0 190 | syn_dict[i] = 0 191 | for i in real_list: 192 | real_dict[int(i)] += float(1 / len(real_list)) 193 | for i in syn_list: 194 | syn_dict[int(i)] += float(1 / len(syn_list)) 195 | 196 | if type == "TV": 197 | tv_distance = 0 198 | for i in range(256): 199 | tv_distance += 0.5 * abs(real_dict[i] - syn_dict[i]) 200 | return tv_distance 201 | elif type == "JSD": 202 | return jsd(real_dict.values(), syn_dict.values(), type="discrete") 203 | elif type == "freq": 204 | return real_dict, syn_dict 205 | else: 206 | raise ValueError("Unknown distance metric!") 207 | 208 | 209 | def get_flowduration(df): 210 | df = df.sort_values("time") 211 | 212 | metadata = ["srcip", "dstip", "srcport", "dstport", "proto"] 213 | gk = df.groupby(by=metadata) 214 | 215 | flow_duration_list = [] 216 | 217 | for name, group in gk: 218 | time_list = list(group["time"]) 219 | flow_duration_list.append(time_list[-1] - time_list[0]) 220 | 221 | return flow_duration_list 222 | 223 | 224 | def compute_metrics_netflow_v3(raw_df, syn_df): 225 | '''JSD + EMD + ranking''' 226 | metrics_dict = {} 227 | 228 | # IP popularity rank 229 | for metric in ["srcip", "dstip"]: 230 | metrics_dict[metric] = compute_IP_rank_distance( 231 | raw_df[metric], syn_df[metric], type="JSD") 232 | 233 | # TV distance for port/protocol 234 | for metric in ["srcport", "dstport", "proto"]: 235 | metrics_dict[metric] = compute_port_proto_distance( 236 | raw_df[metric], 237 | syn_df[metric], 238 | metric, prstr_raw=True, prstr_syn=True, type="JSD") 239 | 240 | # ts, td, pkt, byt 241 | for metric in ["ts", "td", "pkt", "byt"]: 242 | if metric == "ts": 243 | raw_df = raw_df.sort_values("ts").reset_index() 244 | syn_df = syn_df.sort_values("ts").reset_index() 245 | raw_list = list(raw_df["ts"] - raw_df["ts"][0]) 246 | syn_list = list(syn_df["ts"] - syn_df["ts"][0]) 247 | metrics_dict[metric] = wasserstein_distance(raw_list, syn_list) 248 | else: 249 | metrics_dict[metric] = wasserstein_distance( 250 | list(raw_df[metric]), list(syn_df[metric])) 251 | 252 | return metrics_dict 253 | 254 | 255 | def compute_metrics_zeeklog_v3(raw_df, syn_df): 256 | '''JSD + EMD + ranking''' 257 | metrics_dict = {} 258 | 259 | # IP popularity rank 260 | for metric in ["srcip", "dstip"]: 261 | metrics_dict[metric] = compute_IP_rank_distance( 262 | raw_df[metric], syn_df[metric], type="JSD") 263 | 264 | # TV distance for port/protocol 265 | for metric in ["srcport", "dstport", "proto"]: 266 | metrics_dict[metric] = compute_port_proto_distance( 267 | raw_df[metric], 268 | syn_df[metric], 269 | metric, prstr_raw=True, prstr_syn=True, type="JSD") 270 | 271 | # ts,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts, 272 | # orig_ip_bytes,resp_pkts,resp_ip_bytes 273 | for metric in ["ts", "duration", "orig_bytes", "resp_bytes", "missed_bytes", 274 | "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes"]: 275 | if metric == "ts": 276 | raw_df = raw_df.sort_values("ts").reset_index() 277 | 
syn_df = syn_df.sort_values("ts").reset_index() 278 | raw_list = list(raw_df["ts"] - raw_df["ts"][0]) 279 | syn_list = list(syn_df["ts"] - syn_df["ts"][0]) 280 | metrics_dict[metric] = wasserstein_distance(raw_list, syn_list) 281 | else: 282 | metrics_dict[metric] = wasserstein_distance( 283 | list(raw_df[metric]), list(syn_df[metric])) 284 | 285 | # TODO: Important!! How to define the JSD of service and conn_state? 286 | 287 | return metrics_dict 288 | 289 | 290 | def compute_metrics_pcap_v3(raw_df, syn_df): 291 | '''JSD + EMD + ranking''' 292 | metrics_dict = {} 293 | 294 | # IP popularity rank 295 | for metric in ["srcip", "dstip"]: 296 | metrics_dict[metric] = compute_IP_rank_distance( 297 | raw_df[metric], syn_df[metric], type="JSD") 298 | 299 | # TV distance for port/protocol 300 | for metric in ["srcport", "dstport", "proto"]: 301 | metrics_dict[metric] = compute_port_proto_distance( 302 | raw_df[metric], 303 | syn_df[metric], 304 | metric, prstr_raw=True, prstr_syn=True, type="JSD") 305 | 306 | # pkt_len 307 | for metric in ["pkt_len", "time"]: 308 | # if metric == "time": 309 | # label = "pkt_arrivalTime" 310 | # else: 311 | # label = metric 312 | 313 | if metric == "time": 314 | raw_df = raw_df.sort_values("time").reset_index() 315 | syn_df = syn_df.sort_values("time").reset_index() 316 | raw_list = list(raw_df["time"] - raw_df["time"][0]) 317 | syn_list = list(syn_df["time"] - syn_df["time"][0]) 318 | metrics_dict[metric] = wasserstein_distance(raw_list, syn_list) 319 | else: 320 | metrics_dict[metric] = wasserstein_distance( 321 | list(raw_df[metric]), list(syn_df[metric])) 322 | 323 | # interarrival time 324 | # raw_df = raw_df.sort_values("time") 325 | # syn_df = syn_df.sort_values("time") 326 | # metrics_dict["PIAT"] = wasserstein_distance(list(np.diff(raw_df["time"])), list(np.diff(syn_df["time"]))) 327 | 328 | # flow size distribution 329 | metadata = ["srcip", "dstip", "srcport", "dstport", "proto"] 330 | raw_gk = raw_df.groupby(by=metadata) 331 | syn_gk = syn_df.groupby(by=metadata) 332 | 333 | raw_flowsize_list = list(raw_gk.size().values) 334 | syn_flowsize_list = list(syn_gk.size().values) 335 | metrics_dict["flow_size"] = wasserstein_distance( 336 | raw_flowsize_list, syn_flowsize_list) 337 | 338 | return metrics_dict 339 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/embedding_helper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | 4 | from annoy import AnnoyIndex 5 | from gensim.models import Word2Vec 6 | from sklearn.neighbors import NearestNeighbors 7 | from tqdm import tqdm 8 | 9 | 10 | def build_annoy_dictionary_word2vec( 11 | df, 12 | model_path, 13 | word2vec_cols, 14 | word2vec_size, 15 | n_trees): 16 | 17 | model = Word2Vec.load(model_path) 18 | wv = model.wv 19 | 20 | # type : [cols] 21 | # ("ip": ["srcip", "dstip"]) 22 | # "port": ["srcport", "dstport"] 23 | # "proto": ["proto"] 24 | dict_type_cols = {} 25 | for col in word2vec_cols: 26 | type = col.encoding.split("_")[1] 27 | if type not in dict_type_cols: 28 | dict_type_cols[type] = [] 29 | dict_type_cols[type].append(col.column) 30 | print(dict_type_cols) 31 | 32 | sets = [] 33 | dict_type_annDictPair = {} 34 | for type, cols in dict_type_cols.items(): 35 | type_set = set(list(itertools.chain.from_iterable( 36 | [list(df[col]) for col in cols]))) 37 | type_ann = AnnoyIndex(word2vec_size, 'angular') 38 | type_dict = {} 39 | index = 0 40 | 41 | for 
ele in type_set: 42 | type_ann.add_item(index, get_vector( 43 | model, str(ele), norm_option=True)) 44 | type_dict[index] = ele 45 | index += 1 46 | type_ann.build(n_trees) 47 | 48 | dict_type_annDictPair[type] = (type_ann, type_dict) 49 | 50 | print("Finish building Angular trees...") 51 | 52 | return dict_type_annDictPair 53 | 54 | 55 | def get_original_obj(ann, vector, dic): 56 | obj_list = ann.get_nns_by_vector( 57 | vector, 1, search_k=-1, include_distances=False) 58 | 59 | return dic[obj_list[0]] 60 | 61 | 62 | def get_original_objs(ann, vectors, dic): 63 | res = [] 64 | for vector in vectors: 65 | obj_list = ann.get_nns_by_vector( 66 | vector, 1, search_k=-1, include_distances=False) 67 | res.append(dic[obj_list[0]]) 68 | return res 69 | 70 | # return vector for the given word 71 | 72 | 73 | def get_vector(model, word, norm_option=False): 74 | all_words_str = list(model.wv.vocab.keys()) 75 | 76 | # Privacy-related 77 | # If word not in the vocabulary, replace with nearest neighbor 78 | # Suppose that protocol is covered 79 | # while very few port numbers are out of range 80 | if word not in all_words_str: 81 | print(f"{word} not in dict") 82 | print("Help!!!!") 83 | all_words = [] 84 | for ele in all_words_str: 85 | if ele.isdigit(): 86 | all_words.append(int(ele)) 87 | all_words = np.array(all_words).reshape((-1, 1)) 88 | nbrs = NearestNeighbors( 89 | n_neighbors=1, algorithm='ball_tree').fit(all_words) 90 | distances, indices = nbrs.kneighbors([[int(word)]]) 91 | nearest_word = str(all_words[indices[0][0]][0]) 92 | # print("nearest_word:", nearest_word) 93 | model.init_sims() 94 | return model.wv.word_vec(nearest_word, use_norm=norm_option) 95 | else: 96 | model.init_sims() 97 | return model.wv.word_vec(word, use_norm=norm_option) 98 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/main.c: -------------------------------------------------------------------------------- 1 | // https://www.binarytides.com/packet-sniffer-code-c-libpcap-linux-sockets/ 2 | // http://tonylukasavage.com/blog/2010/12/19/offline-packet-capture-analysis-with-c-c----amp--libpcap/ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | /* 17 | pcap and network related 18 | */ 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "packet.h" 28 | 29 | #define ETHER_HDR_TRUNCATE 0 30 | 31 | int tcp = 0, udp = 0, icmp = 0, others = 0, total = 0; 32 | 33 | void packetHandler(u_char *userData, const struct pcap_pkthdr *pkthdr, const u_char *packet) 34 | { 35 | struct ether_header *ep; 36 | unsigned short ether_type; 37 | 38 | // read Ethernet header if not truncated 39 | if (!ETHER_HDR_TRUNCATE) 40 | { 41 | ep = (struct ether_header *)packet; 42 | 43 | // protocol type 44 | ether_type = ntohs(ep->ether_type); 45 | 46 | // IPv4 47 | if (ether_type == ETHERTYPE_IP) 48 | { 49 | packet += ETHER_HDR_LEN; 50 | } 51 | 52 | // 802.1Q 53 | else if (ether_type == ETHERTYPE_VLAN) 54 | { 55 | ether_type = ntohs(*(uint16_t *)(packet + 16)); 56 | 57 | // only process IP packet 58 | if (ether_type == ETHERTYPE_IP) 59 | { 60 | packet += ETHER_HDR_LEN; 61 | packet += 4; 62 | } 63 | 64 | else 65 | return; 66 | } 67 | } 68 | 69 | Packet p; 70 | 71 | /* Timestamp */ 72 | unsigned long time_in_micros = pkthdr->ts.tv_sec * 1000000 + pkthdr->ts.tv_usec; 73 | p.timestamp = time_in_micros; 74 | // 
printf("timestamp: %lu\n", time_in_micros); 75 | 76 | /* IP header */ 77 | // Note: caida traces does not include ethernet headers 78 | // data_center (IMC 2010): enternet headers, YES 79 | // MACCDC_2012 80 | const struct ip *ipHeader; 81 | char sourceIp[INET_ADDRSTRLEN]; 82 | char destIp[INET_ADDRSTRLEN]; 83 | 84 | ipHeader = (struct ip *)(packet); 85 | 86 | inet_ntop(AF_INET, &(ipHeader->ip_src), sourceIp, INET_ADDRSTRLEN); 87 | inet_ntop(AF_INET, &(ipHeader->ip_dst), destIp, INET_ADDRSTRLEN); 88 | 89 | struct in_addr tmpPkt1, tmpPkt2; 90 | inet_aton(sourceIp, &tmpPkt1); 91 | inet_aton(destIp, &tmpPkt2); 92 | 93 | p.srcip = ntohl(tmpPkt1.s_addr); 94 | p.dstip = ntohl(tmpPkt2.s_addr); 95 | 96 | p.ip_hl = (unsigned int)ipHeader->ip_hl; 97 | p.ip_v = (unsigned int)ipHeader->ip_v; 98 | p.ip_tos = (uint8_t)ipHeader->ip_tos; 99 | p.ip_len = ntohs(ipHeader->ip_len); 100 | p.ip_id = ntohs(ipHeader->ip_id); 101 | p.ip_off = ntohs(ipHeader->ip_off); 102 | p.ip_ttl = (uint8_t)ipHeader->ip_ttl; 103 | p.ip_p = (uint8_t)ipHeader->ip_p; 104 | p.ip_sum = ntohs(ipHeader->ip_sum); 105 | 106 | // TCP/UDP 107 | total++; 108 | switch (ipHeader->ip_p) 109 | { 110 | // ICMP Protocol 111 | case 1: 112 | icmp++; 113 | break; 114 | 115 | // TCP Protocol 116 | case 6: 117 | tcp++; 118 | 119 | struct tcphdr *tcpHeader = (struct tcphdr *)(packet + p.ip_hl * 4); 120 | p.srcport = ntohs(tcpHeader->th_sport); 121 | p.dstport = ntohs(tcpHeader->th_dport); 122 | // printf("%hu, %hu\n", p.srcport, p.dstport); 123 | break; 124 | 125 | // UDP Protocol 126 | case 17: 127 | udp++; 128 | struct udphdr *udpHeader = (struct udphdr *)(packet + p.ip_hl * 4); 129 | p.srcport = ntohs(udpHeader->uh_sport); 130 | p.dstport = ntohs(udpHeader->uh_dport); 131 | // printf("%hu, %hu\n", p.srcport, p.dstport); 132 | break; 133 | 134 | default: 135 | others++; 136 | break; 137 | } 138 | 139 | // printf("%u, %u, %hu, %hu, %u, %lu, %hu, %u, %u, %u, %hu, %u, %hu\n", p.srcip, p.dstip, p.srcport, p.dstport, p.ip_p, p.timestamp, p.ip_len, p.ip_v, p.ip_hl, p.ip_tos, p.ip_id, p.ip_ttl, p.ip_sum); 140 | 141 | trace_pkts = (Packet *)realloc(trace_pkts, (trace_count + 1) * sizeof(Packet)); 142 | trace_pkts[trace_count] = p; 143 | 144 | trace_count++; 145 | } 146 | 147 | void pcapParser(char *fileName) 148 | { 149 | pcap_t *descr; 150 | char errbuf[PCAP_ERRBUF_SIZE]; 151 | 152 | // open trace file for offline processing 153 | printf("Pre-process pcap file %s\n", fileName); 154 | trace_count = 0; 155 | 156 | descr = pcap_open_offline(fileName, errbuf); 157 | 158 | if (descr == NULL) 159 | { 160 | printf("[FILE ERROR] pcap_open_live() failed: \n"); 161 | } 162 | 163 | // start packet processing loop, just like live capture 164 | if (pcap_loop(descr, 0, packetHandler, NULL) < 0) 165 | { 166 | printf("pcap_loop() failed: %s\n", pcap_geterr(descr)); 167 | } 168 | 169 | printf("This pcap chunk reading is done... 
total %d packets \n", trace_count); 170 | } 171 | 172 | int pcap2csv(char *pcapFile, char *csvFile) 173 | { 174 | printf("pcap file: %s\n", pcapFile); 175 | printf("csv file: %s\n", csvFile); 176 | 177 | pcapParser(pcapFile); 178 | printf("TCP: %d, UDP: %d, ICMP: %d, Others: %d, total: %d\n", tcp, udp, icmp, others, total); 179 | 180 | FILE *fp; 181 | fp = fopen(csvFile, "w+"); 182 | 183 | fprintf(fp, "srcip,dstip,srcport,dstport,proto,time,pkt_len,version,ihl,tos,id,flag,off,ttl,chksum\n"); 184 | 185 | for (int i = 0; i < trace_count; i++) 186 | { 187 | Packet p = trace_pkts[i]; 188 | 189 | unsigned short int ip_flag = p.ip_off >> 13; 190 | unsigned short int ip_off = p.ip_off & IP_OFFMASK; 191 | 192 | char proto[128]; 193 | if (p.ip_p == 6) 194 | { 195 | strcpy(proto, "TCP"); 196 | } 197 | else if (p.ip_p == 17) 198 | { 199 | strcpy(proto, "UDP"); 200 | } 201 | else 202 | { 203 | printf("Not TCP/UDP packet!\n"); 204 | } 205 | 206 | fprintf(fp, "%u,%u,%hu,%hu,%s,%lu,%hu,%u,%u,%u,%hu,%hu,%hu,%u,%hu\n", p.srcip, p.dstip, p.srcport, p.dstport, proto, p.timestamp, p.ip_len, p.ip_v, p.ip_hl, p.ip_tos, p.ip_id, ip_flag, ip_off, p.ip_ttl, p.ip_sum); 207 | } 208 | 209 | fclose(fp); 210 | 211 | return 0; 212 | } -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/packet.h: -------------------------------------------------------------------------------- 1 | #ifndef PACKET_H 2 | #define PACKET_H 3 | 4 | // http://yuba.stanford.edu/~casado/pcap/section4.html 5 | 6 | #include 7 | 8 | // Self-defined packet structure 9 | typedef struct Packet 10 | { 11 | // Timestamp in microseconds 12 | unsigned long timestamp; /* timestamp */ 13 | 14 | // IP header 15 | unsigned int ip_hl; /* header length */ 16 | unsigned int ip_v; /* version */ 17 | uint8_t ip_tos; /* type of service */ 18 | u_short ip_len; /* total length */ 19 | u_short ip_id; /* identification */ 20 | u_short ip_off; /* fragment offset field */ 21 | uint8_t ip_ttl; /* time to live */ 22 | uint8_t ip_p; /* protocol */ 23 | u_short ip_sum; /* checksum */ 24 | uint32_t srcip; /* source IP */ 25 | uint32_t dstip; /* destination IP */ 26 | 27 | // TCP/UDP 28 | u_short srcport; /* source port */ 29 | u_short dstport; /* destination port */ 30 | }Packet; 31 | 32 | Packet* trace_pkts; 33 | int trace_count = 0; 34 | 35 | #endif -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/sharedlib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cc -fPIC -shared -o pcap2csv.so main.c -lm -lpcap -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import pickle 4 | import math 5 | import json 6 | import ast 7 | import socket 8 | import struct 9 | import ipaddress 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | from pathlib import Path 14 | from scapy.all import IP, ICMP, TCP, UDP 15 | from scapy.all import wrpcap 16 | from scipy.stats import rankdata 17 | from .embedding_helper import ( 18 | build_annoy_dictionary_word2vec, 19 | get_original_obj 20 | ) 21 | from .dist_metrics import ( 22 | compute_metrics_netflow_v3, 23 | compute_metrics_pcap_v3, 24 | compute_metrics_zeeklog_v3 25 | ) 26 | from ...model_managers.netshare_manager.netshare_util import 
get_configid_from_kv 27 | 28 | 29 | def convert_sdmetricsConfigQuant_to_fieldValueDict( 30 | sdmetricsConfigQuant 31 | ): 32 | '''Convert the sdmetricsConfigQuant to fieldValueDict 33 | Args: 34 | sdmetricsConfigQuant (dict): returned by create_sdmetrics_config(..., comparison_type='quantitative') 35 | Returns: 36 | fieldValueDict (dict): {field_name: value} 37 | ''' 38 | 39 | fieldValueDict = {} 40 | for metric_type, metrics in sdmetricsConfigQuant.items(): 41 | for metric_class_name, metric_class in metrics.items(): 42 | # metrics with target (e.g., attr dist similarity) 43 | if isinstance(metric_class, dict): 44 | for field_name, field_value in metric_class.items(): 45 | fieldValueDict[ast.literal_eval( 46 | field_name)[0]] = field_value[0][0] 47 | # metrics without target (e.g., session length) 48 | elif isinstance(metric_class, list): 49 | fieldValueDict[metric_class_name] = metric_class[0][0] 50 | 51 | return fieldValueDict 52 | 53 | 54 | def create_sdmetrics_config( 55 | config_pre_post_processor, 56 | comparison_type='both' 57 | ): 58 | # Refer to https://github.com/netsharecmu/SDMetrics_timeseries/blob/master/sdmetrics/reports/timeseries/sunglasses_qr.json to see the format of the config file 59 | sdmetrics_config = { 60 | "metadata": { 61 | "fields": {} 62 | }, 63 | "config": { 64 | "metrics": { 65 | "fidelity": [] 66 | } 67 | } 68 | } 69 | 70 | # Enumerate through all the fields in the metadata, timeseries, and timestamp 71 | for i, field in enumerate(config_pre_post_processor.metadata + 72 | config_pre_post_processor.timeseries): 73 | if field in config_pre_post_processor.metadata: 74 | metric_class_name = "Single attribute distributional similarity" 75 | class_name = "AttrDistSimilarity" 76 | elif field in config_pre_post_processor.timeseries: 77 | metric_class_name = "Single feature distributional similarity" 78 | class_name = "FeatureDistSimilarity" 79 | if 'bit' in getattr(field, 'encoding', '') or \ 80 | 'word2vec' in getattr(field, 'encoding', '') or \ 81 | 'categorical' in getattr(field, 'encoding', ''): 82 | sdmetrics_config["metadata"]["fields"][ 83 | field.column] = { 84 | "type": "categorical"} 85 | if getattr(field, 'type', '') == 'float': 86 | sdmetrics_config["metadata"]["fields"][ 87 | field.column] = { 88 | "type": "numerical"} 89 | sdmetrics_config["config"]["metrics"]["fidelity"].append( 90 | { 91 | metric_class_name: { 92 | "class": class_name, 93 | "target_list": [[field.column]], 94 | "configs": { 95 | "categorical_mapping": getattr(field, 'categorical_mapping', True), 96 | "comparison_type": comparison_type 97 | } 98 | } 99 | } 100 | ) 101 | 102 | # Add session length metric if the dataset is a pcap 103 | if config_pre_post_processor.dataset_type == 'pcap': 104 | sdmetrics_config["config"]["metrics"]["fidelity"].append( 105 | { 106 | "Session length distributional similarity": { 107 | "class": "SessionLengthDistSimilarity", 108 | "configs": { 109 | "comparison_type": comparison_type 110 | } 111 | } 112 | } 113 | ) 114 | if config_pre_post_processor.timestamp.generation: 115 | sdmetrics_config["metadata"]["fields"][ 116 | config_pre_post_processor.timestamp.column] = { 117 | "type": "numerical"} 118 | sdmetrics_config["config"]["metrics"]["fidelity"].append( 119 | { 120 | "Single feature distributional similarity": { 121 | "class": "FeatureDistSimilarity", 122 | "target_list": [ 123 | [ 124 | config_pre_post_processor.timestamp.column 125 | ] 126 | ], 127 | "configs": { 128 | "comparison_type": comparison_type 129 | } 130 | } 131 | } 132 | ) 133 | 
sdmetrics_config["metadata"]["entity_columns"] = [ 134 | field.column for field in config_pre_post_processor.metadata 135 | ] 136 | sdmetrics_config["metadata"]["sequence_index"] = config_pre_post_processor.timestamp.column if config_pre_post_processor.timestamp.generation else None 137 | sdmetrics_config["metadata"]["context_columns"] = [] 138 | 139 | return sdmetrics_config 140 | 141 | 142 | def _last_lvl_folder(folder): 143 | return str(Path(folder).parents[0]) 144 | 145 | 146 | def IP_int2str(IP_int): 147 | return str(ipaddress.ip_address(IP_int)) 148 | 149 | 150 | def IP_str2int(IP_str): 151 | return int(ipaddress.ip_address(IP_str)) 152 | 153 | 154 | def IPs_int2str(IPs_int): 155 | return [IP_int2str(i) for i in IPs_int] 156 | 157 | 158 | def IPs_str2int(IPs_str): 159 | return [IP_str2int(i) for i in IPs_str] 160 | 161 | 162 | pr_dict = { 163 | "ESP": 50, 164 | "GRE": 47, 165 | "ICMP": 1, 166 | "IPIP": 4, 167 | "IPv6": 41, 168 | "TCP": 6, 169 | "UDP": 17, 170 | "Other": 255 171 | } 172 | 173 | 174 | def prs_str2int(prs): 175 | prs_int = [] 176 | for p in prs: 177 | prs_int.append(pr_dict[p]) 178 | return prs_int 179 | 180 | 181 | pr_int2str_dict = { 182 | 1: "ICMP", 183 | 4: "IPIP", 184 | 6: "TCP", 185 | 17: "UDP", 186 | 41: "IPv6", 187 | 47: "GRE", 188 | 50: "ESP", 189 | 255: "Other" 190 | } 191 | 192 | 193 | def prs_int2str(prs_int): 194 | prs_str = [] 195 | for p in prs_int: 196 | prs_str.append(pr_int2str_dict[p]) 197 | return prs_str 198 | 199 | 200 | def csv2pcap_single(input, output): 201 | # df = pd.read_csv(input).sort_values(["time"]) 202 | df = input.sort_values(["time"]) 203 | 204 | packets = [] 205 | 206 | for i, row in tqdm(df.iterrows(), total=df.shape[0]): 207 | time = float(row["time"] / 10**6) 208 | if isinstance(row["srcip"], str): 209 | srcip = IP_str2int(row["srcip"]) 210 | dstip = IP_str2int(row["dstip"]) 211 | src = socket.inet_ntoa(struct.pack('!L', srcip)) 212 | dst = socket.inet_ntoa(struct.pack('!L', dstip)) 213 | else: 214 | src = socket.inet_ntoa(struct.pack('!L', row["srcip"])) 215 | dst = socket.inet_ntoa(struct.pack('!L', row["dstip"])) 216 | 217 | srcport = row["srcport"] 218 | dstport = row["dstport"] 219 | proto = row["proto"] 220 | pkt_len = int(row["pkt_len"]) 221 | 222 | try: 223 | proto = int(proto) 224 | except BaseException: 225 | if proto == "TCP": 226 | proto = 6 227 | elif proto == "UDP": 228 | proto = 17 229 | elif proto == "ICMP": 230 | proto = 1 231 | else: 232 | proto = 0 233 | 234 | ip = IP(src=src, dst=dst, len=pkt_len, proto=proto) 235 | if proto == 1: 236 | p = ip / ICMP() 237 | elif proto == 6: 238 | tcp = TCP(sport=srcport, dport=dstport) 239 | p = ip / tcp 240 | elif proto == 17: 241 | udp = UDP(sport=srcport, dport=dstport) 242 | p = ip / udp 243 | else: 244 | p = ip 245 | 246 | p.time = time 247 | p.len = pkt_len 248 | p.wirelen = pkt_len + 4 249 | 250 | packets.append(p) 251 | 252 | wrpcap(output, packets) 253 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/word2vec_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | from gensim.models import Word2Vec 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from .embedding_helper import build_annoy_dictionary_word2vec 9 | from .embedding_helper import get_original_obj, get_vector 10 | from sklearn.neighbors import NearestNeighbors 11 | 12 | 13 | def test_embed_bidirectional(model_path, ann, dic, word): 14 | model = 
Word2Vec.load(model_path) 15 | 16 | raw_vec = get_vector(model, word, False) 17 | normed_vec = get_vector(model, word, True) 18 | 19 | print("word: {}, vector(raw): {}".format(word, raw_vec)) 20 | print("word: {}, vector(l2-norm): {}".format(word, normed_vec)) 21 | 22 | print("vec(raw): {}, word: {}".format( 23 | raw_vec, get_original_obj(ann, raw_vec, dic))) 24 | print("vec(l2-norm): {}, word: {}".format(normed_vec, 25 | get_original_obj(ann, normed_vec, dic))) 26 | print() 27 | 28 | 29 | def test_model( 30 | df, 31 | model_path, 32 | word2vec_cols, 33 | word2vec_size, 34 | annoy_n_trees 35 | ): 36 | dict_type_annDictPair = build_annoy_dictionary_word2vec( 37 | df=df, 38 | model_path=model_path, 39 | word2vec_cols=word2vec_cols, 40 | word2vec_size=word2vec_size, 41 | n_trees=annoy_n_trees 42 | ) 43 | 44 | for col in word2vec_cols: 45 | type = col.encoding.split("_")[1] 46 | word = random.choice(df[col.column]) 47 | print("Testing {col.column}...") 48 | test_embed_bidirectional( 49 | model_path=model_path, 50 | ann=dict_type_annDictPair[type][0], 51 | dic=dict_type_annDictPair[type][1], 52 | word=word) 53 | 54 | 55 | def word2vec_train( 56 | df, 57 | out_dir, 58 | model_name, 59 | word2vec_cols, 60 | word2vec_size, 61 | annoy_n_trees, 62 | force_retrain=False, # retrain from scratch 63 | model_test=False 64 | ): 65 | model_path = os.path.join( 66 | out_dir, 67 | "{}_{}.model".format(model_name, word2vec_size)) 68 | 69 | if os.path.exists(model_path) and not force_retrain: 70 | print("Loading Word2Vec pre-trained model...") 71 | model = Word2Vec.load(model_path) 72 | else: 73 | print("Training Word2Vec model from scratch...") 74 | sentences = [] 75 | for row in range(0, len(df)): 76 | sentence = [str(df.at[row, col]) 77 | for col in [c.column for c in word2vec_cols]] 78 | sentences.append(sentence) 79 | 80 | model = Word2Vec( 81 | sentences=sentences, 82 | size=word2vec_size, 83 | window=5, 84 | min_count=1, 85 | workers=10) 86 | model.save(model_path) 87 | print(f"Word2Vec model is saved at {model_path}") 88 | 89 | if model_test: 90 | test_model( 91 | df=df, 92 | model_path=model_path, 93 | word2vec_cols=word2vec_cols, 94 | word2vec_size=word2vec_size, 95 | annoy_n_trees=annoy_n_trees 96 | ) 97 | 98 | return model_path 99 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/pre_post_processor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | 4 | from netshare.utils import Tee 5 | 6 | 7 | class PrePostProcessor(ABC): 8 | def __init__(self, config): 9 | self._config = config 10 | 11 | @abstractmethod 12 | def _pre_process(self, input_folder, output_folder, log_folder): 13 | ... 14 | 15 | @abstractmethod 16 | def _post_process(self, input_folder, output_folder, 17 | pre_processed_data_folder, log_folder): 18 | ... 
19 | 20 | def pre_process(self, input_folder, output_folder, log_folder): 21 | stdout_log_path = os.path.join(log_folder, 'pre_process.stdout.log') 22 | stderr_log_path = os.path.join(log_folder, 'pre_process.stderr.log') 23 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 24 | return self._pre_process( 25 | input_folder=input_folder, 26 | output_folder=output_folder, 27 | log_folder=log_folder) 28 | 29 | def post_process(self, input_folder, output_folder, 30 | pre_processed_data_folder, log_folder): 31 | stdout_log_path = os.path.join(log_folder, 'post_process.stdout.log') 32 | stderr_log_path = os.path.join(log_folder, 'post_process.stderr.log') 33 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 34 | return self._post_process( 35 | input_folder=input_folder, 36 | output_folder=output_folder, 37 | pre_processed_data_folder=pre_processed_data_folder, 38 | log_folder=log_folder) 39 | -------------------------------------------------------------------------------- /netshare/ray/__init__.py: -------------------------------------------------------------------------------- 1 | from .remote import remote, get 2 | from .config import config 3 | from .ray_functions import init, shutdown 4 | 5 | 6 | __all__ = ['config', 'init', 'shutdown', 'remote', 'get'] 7 | -------------------------------------------------------------------------------- /netshare/ray/config.py: -------------------------------------------------------------------------------- 1 | from addict import Dict 2 | 3 | config = Dict( 4 | enabled=True) 5 | config.freeze() 6 | -------------------------------------------------------------------------------- /netshare/ray/ray_functions.py: -------------------------------------------------------------------------------- 1 | from .config import config as ray_config 2 | 3 | def init(*args, **kwargs): 4 | if ray_config.enabled: 5 | print('Ray is enabled') 6 | import ray 7 | ray.init(*args, **kwargs) 8 | else: 9 | print('Ray is disabled') 10 | 11 | 12 | def shutdown(*args, **kargs): 13 | if ray_config.enabled: 14 | print('Ray is enabled') 15 | import ray 16 | ray.shutdown(*args, **kargs) 17 | else: 18 | print("Ray is disabled") 19 | -------------------------------------------------------------------------------- /netshare/ray/remote.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from .config import config as ray_config 3 | 4 | 5 | class ResultWrapper(object): 6 | def __init__(self, result): 7 | self._result = result 8 | 9 | def get_result(self): 10 | return self._result 11 | 12 | 13 | class RemoteFunctionWrapper(object): 14 | def __init__(self, *args, **kwargs): 15 | self._actual_remote_function = None 16 | self._ray_args = args 17 | self._ray_kwargs = kwargs 18 | 19 | def __call__(self, *args, **kwargs): 20 | raise TypeError('Remote functions cannot be called directly.') 21 | 22 | def remote(self, *args, **kwargs): 23 | if ray_config.enabled: 24 | if self._actual_remote_function is None: 25 | import ray 26 | if len(self._ray_kwargs) == 0: 27 | self._actual_remote_function = ray.remote( 28 | *self._ray_args, **self._ray_kwargs) 29 | else: 30 | self._actual_remote_function = ray.remote( 31 | **self._ray_kwargs)(*self._ray_args) 32 | return self._actual_remote_function.remote(*args, **kwargs) 33 | else: 34 | return ResultWrapper(self._ray_args[0](*args, **kwargs)) 35 | 36 | 37 | def remote(*args, **kwargs): 38 | if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): 39 | # This is the 
case where the decorator is just @ray.remote. 40 | # "args[0]" is the class or function under the decorator. 41 | return RemoteFunctionWrapper(args[0]) 42 | if not (len(args) == 0 and len(kwargs) > 0): 43 | raise ValueError('Error in the parameters of the decorator') 44 | return functools.partial(RemoteFunctionWrapper, **kwargs) 45 | 46 | 47 | def get(object_refs, **kwargs): 48 | if ray_config.enabled: 49 | import ray 50 | return ray.get(object_refs, **kwargs) 51 | else: 52 | if isinstance(object_refs, ResultWrapper): 53 | return object_refs.get_result() 54 | elif isinstance(object_refs, list): 55 | return [object_ref.get_result() for object_ref in object_refs] 56 | -------------------------------------------------------------------------------- /netshare/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .tee import Tee 2 | from .field import ContinuousField, DiscreteField, BitField, Word2VecField 3 | from .output import OutputType, Normalization, Output 4 | from .exec_cmd import exec_cmd 5 | 6 | __all__ = ['Tee', 'ContinuousField', 'DiscreteField', 'BitField', 7 | 'Word2VecField', 'OutputType', 'Normalization', 'Output', 'exec_cmd'] 8 | -------------------------------------------------------------------------------- /netshare/utils/exec_cmd.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def exec_cmd(cmd, wait=False): 5 | p = subprocess.Popen(cmd, shell=True) 6 | if wait: 7 | p.wait() 8 | -------------------------------------------------------------------------------- /netshare/utils/field.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import pandas as pd 5 | from typing import Any, Dict, List 6 | from collections import defaultdict 7 | from annoy import AnnoyIndex 8 | 9 | from .output import Normalization, OutputType, Output 10 | from ..pre_post_processors.netshare.embedding_helper import get_vector, get_original_obj, get_original_objs 11 | 12 | EPS = 1e-8 13 | 14 | 15 | class Field(object): 16 | def __init__(self, name): 17 | self.name = name 18 | 19 | def normalize(self): 20 | raise NotImplementedError 21 | 22 | def denormalize(self): 23 | raise NotImplementedError 24 | 25 | def getOutputType(self): 26 | raise NotImplementedError 27 | 28 | 29 | class ContinuousField(Field): 30 | def __init__( 31 | self, norm_option, min_x=None, max_x=None, dim_x=1, 32 | log1p_norm=False, *args, **kwargs): 33 | super(ContinuousField, self).__init__(*args, **kwargs) 34 | 35 | self.min_x = min_x 36 | self.max_x = max_x 37 | self.norm_option = norm_option 38 | self.dim_x = dim_x 39 | self.log1p_norm = log1p_norm 40 | if self.log1p_norm: 41 | self.min_x = np.log1p(self.min_x) 42 | self.max_x = np.log1p(self.max_x) 43 | 44 | # Normalize x in [a, b]: x' = (b-a)(x-min x)/(max x - minx) + a 45 | def normalize(self, x): 46 | if x.shape[-1] != self.dim_x: 47 | raise ValueError(f"Dimension is {x.shape[-1]}. 
" 48 | f"Expected dimension is {self.dim_x}") 49 | if self.log1p_norm: 50 | x = np.log1p(x) 51 | 52 | # [0, 1] normalization 53 | if self.norm_option == Normalization.ZERO_ONE: 54 | return np.asarray((x - self.min_x) / (self.max_x - self.min_x)) 55 | 56 | # [-1, 1] normalization 57 | elif self.norm_option == Normalization.MINUSONE_ONE: 58 | return np.asarray(2 * (x - self.min_x) 59 | / (self.max_x - self.min_x) - 1) 60 | else: 61 | raise Exception("Not valid normalization option!") 62 | 63 | def denormalize(self, norm_x): 64 | if norm_x.shape[-1] != self.dim_x: 65 | raise ValueError(f"Dimension is {norm_x.shape[-1]}. " 66 | f"Expected dimension is {self.dim_x}") 67 | norm_x = norm_x.astype(np.float64) # Convert to float64 for precision 68 | 69 | # [0, 1] normalization 70 | if self.norm_option == Normalization.ZERO_ONE: 71 | to_return = norm_x * float(self.max_x - self.min_x) + self.min_x 72 | 73 | # [-1, 1] normalization 74 | elif self.norm_option == Normalization.MINUSONE_ONE: 75 | to_return = (norm_x + 1) / 2.0 * \ 76 | float(self.max_x - self.min_x) + self.min_x 77 | 78 | else: 79 | raise Exception("Not valid normalization option!") 80 | 81 | if self.log1p_norm: 82 | to_return = np.expm1(to_return) 83 | 84 | return to_return 85 | 86 | def getOutputType(self): 87 | return Output( 88 | type_=OutputType.CONTINUOUS, 89 | dim=self.dim_x, 90 | normalization=self.norm_option 91 | ) 92 | 93 | 94 | class DiscreteField(Field): 95 | def __init__(self, choices, *args, **kwargs): 96 | super(DiscreteField, self).__init__(*args, **kwargs) 97 | 98 | if not isinstance(choices, list): 99 | raise Exception("choices should be a list") 100 | self.choices = choices 101 | self.dim_x = len(choices) 102 | 103 | def normalize(self, x): 104 | if not isinstance(x, (list, np.ndarray)): 105 | norm_x = [x] 106 | else: 107 | norm_x = x 108 | norm_x = pd.DataFrame(norm_x).astype( 109 | pd.CategoricalDtype(categories=self.choices)) 110 | norm_x = pd.get_dummies(norm_x).to_numpy() 111 | if not isinstance(x, (list, np.ndarray)): 112 | norm_x = norm_x[0] 113 | 114 | return norm_x 115 | 116 | def denormalize(self, norm_x): 117 | index = np.argmax(norm_x, axis=-1) 118 | 119 | return np.asarray(self.choices)[index] 120 | 121 | def getOutputType(self): 122 | return Output( 123 | type_=OutputType.DISCRETE, 124 | dim=len(self.choices) 125 | ) 126 | 127 | 128 | class BitField(Field): 129 | def __init__(self, num_bits, *args, **kwargs): 130 | super(BitField, self).__init__(*args, **kwargs) 131 | 132 | self.num_bits = num_bits 133 | self.dim_x = 2*num_bits 134 | 135 | def normalize(self, decimal_x): 136 | bin_x = bin(int(decimal_x))[2:].zfill(self.num_bits) 137 | bin_x = [int(b) for b in bin_x] 138 | 139 | bits = [] 140 | for b in bin_x: 141 | if b == 0: 142 | bits += [1.0, 0.0] 143 | 144 | elif b == 1: 145 | bits += [0.0, 1.0] 146 | 147 | else: 148 | print("Binary number is zero or one!") 149 | 150 | return bits 151 | 152 | def denormalize(self, bin_x): 153 | if len(bin_x.shape) == 3: 154 | # This is a timeseries field 155 | a, b, c = bin_x.shape 156 | if self.num_bits * 2 != c: 157 | raise ValueError( 158 | f"Dimension is {c}. 
Expected dimension is {self.num_bits * 2}" 159 | ) 160 | return self.denormalize( 161 | bin_x.reshape(a * b, c)).to_numpy().reshape( 162 | a, b) 163 | df_bin = pd.DataFrame(bin_x) 164 | chosen_bits = (df_bin > df_bin.shift(axis=1)).drop( 165 | range(0, self.num_bits * 2, 2), axis=1 166 | ) 167 | return chosen_bits.dot(1 << np.arange(self.num_bits - 1, -1, -1)) 168 | 169 | def getOutputType(self): 170 | outputs = [] 171 | 172 | for i in range(self.num_bits): 173 | outputs.append(Output(type_=OutputType.DISCRETE, dim=2)) 174 | 175 | return outputs 176 | 177 | 178 | class Word2VecField(Field): 179 | def __init__( 180 | self, word2vec_size, pre_processed_data_folder, word2vec_type, * 181 | args, **kwargs): 182 | super(Word2VecField, self).__init__(*args, **kwargs) 183 | 184 | self.word2vec_size = word2vec_size 185 | self.preprocessed_data_folder = pre_processed_data_folder 186 | self.word2vec_type = word2vec_type 187 | self.dim_x = word2vec_size 188 | self.norm_option = Normalization.MINUSONE_ONE 189 | 190 | def normalize(self, x, embed_model): 191 | return np.array( 192 | [get_vector(embed_model, str(xi), norm_option=True) for xi in x] 193 | ) 194 | 195 | def denormalize(self, norm_x): 196 | # load Annoy and Dict 197 | type_ann = AnnoyIndex(self.word2vec_size, 'angular') 198 | type_ann.load(os.path.join( 199 | self.preprocessed_data_folder, 200 | f"{self.word2vec_type}_ann.ann")) 201 | with open(os.path.join(self.preprocessed_data_folder, f"{self.word2vec_type}_dict.json"), 'r') as f: 202 | type_dict = json.load(f) 203 | 204 | if len(norm_x.shape) == 3: 205 | # This is a timeseries field 206 | return np.array( 207 | [ 208 | get_original_objs( 209 | ann=type_ann, 210 | vectors=x, 211 | dic={int(k): v for k, v in type_dict.items()} 212 | ) 213 | for x in norm_x 214 | ] 215 | ) 216 | return np.asarray(get_original_objs( 217 | ann=type_ann, 218 | vectors=norm_x, 219 | dic={int(k): v for k, v in type_dict.items()} 220 | )) 221 | 222 | def getOutputType(self): 223 | return Output( 224 | type_=OutputType.CONTINUOUS, 225 | dim=self.dim_x, 226 | normalization=self.norm_option 227 | ) 228 | -------------------------------------------------------------------------------- /netshare/utils/logger.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import io 3 | import logging 4 | import sys 5 | 6 | logger: logging.Logger = logging.getLogger("netshare") 7 | 8 | handler = logging.StreamHandler(sys.stdout) 9 | formatter = logging.Formatter( 10 | "%(asctime)s - %(levelname)s - %(funcName)s - %(message)s" 11 | ) 12 | handler.setFormatter(formatter) 13 | logger.addHandler(handler) 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | class TqdmToLogger(io.StringIO): 18 | """ 19 | Util to output tqdm progress bar to the logger. 
20 | """ 21 | 22 | def __init__(self, description: str) -> None: 23 | super().__init__() 24 | self.description = description 25 | 26 | def write(self, buf: str) -> int: 27 | if buf.strip(): 28 | logger.debug(f"{self.description}: {buf.strip()}") 29 | return len(buf) 30 | -------------------------------------------------------------------------------- /netshare/utils/output.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class OutputType(Enum): 5 | CONTINUOUS = "CONTINUOUS" 6 | DISCRETE = "DISCRETE" 7 | 8 | 9 | class Normalization(Enum): 10 | ZERO_ONE = "ZERO_ONE" 11 | MINUSONE_ONE = "MINUSONE_ONE" 12 | 13 | 14 | class Output(object): 15 | def __init__(self, type_, dim, normalization=None, is_gen_flag=False): 16 | self.type_ = type_ 17 | self.dim = dim 18 | self.normalization = normalization 19 | self.is_gen_flag = is_gen_flag 20 | 21 | if type_ == OutputType.CONTINUOUS and normalization is None: 22 | raise Exception("normalization must be set for continuous output") 23 | -------------------------------------------------------------------------------- /netshare/utils/tee.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | 4 | 5 | class DuplicateWriter(object): 6 | def __init__(self, file_objects): 7 | self._file_objects = file_objects 8 | 9 | def write(self, data): 10 | for file_object in self._file_objects: 11 | file_object.write(data) 12 | file_object.flush() 13 | 14 | def writelines(self, data): 15 | for file_object in self._file_objects: 16 | file_object.write(data) 17 | file_object.flush() 18 | 19 | def flush(self): 20 | for file_object in self._file_objects: 21 | file_object.flush() 22 | 23 | def close(self): 24 | for file_object in self._file_objects: 25 | file_object.close() 26 | 27 | 28 | class Tee(object): 29 | def __init__(self, stdout_path, stderr_path): 30 | self.stdout_file = open(stdout_path, 'w') 31 | self.stderr_file = open(stderr_path, 'w') 32 | self.stdout = sys.stdout 33 | self.stderr = sys.stderr 34 | self.stdout_writer = DuplicateWriter([sys.stdout, self.stdout_file]) 35 | self.stderr_writer = DuplicateWriter([sys.stderr, self.stderr_file]) 36 | 37 | def __enter__(self): 38 | sys.stdout = self.stdout_writer 39 | sys.stderr = self.stderr_writer 40 | 41 | def __exit__(self, exc_type, exc, exc_tb): 42 | sys.stdout = self.stdout 43 | sys.stderr = self.stderr 44 | if exc_type is not None: 45 | self.stderr_writer.write(traceback.format_exc()) 46 | self.stderr_writer.flush() 47 | self.stdout_writer.flush() 48 | self.stderr_file.close() 49 | self.stdout_file.close() 50 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | VERSION = "0.0.1" 5 | DESCRIPTION = "NetShare" 6 | this_directory = Path(__file__).parent 7 | LONG_DESCRIPTION = (this_directory / "README.md").read_text() 8 | 9 | # Setting up 10 | setup( 11 | # the name must match the folder name 'verysimplemodule' 12 | name="netshare", 13 | version=VERSION, 14 | author="Yucheng Yin, Zinan Lin, Minhao Jin, Giulia Fanti, Vyas Sekar", 15 | author_email="yyin4@andrew.cmu.edu", 16 | description=DESCRIPTION, 17 | long_description=LONG_DESCRIPTION, 18 | long_description_content_type="text/markdown", 19 | packages=find_packages(), 20 | install_requires=[ 21 | "torch", 22 | 
"tensorboard", 23 | "opacus", 24 | "tqdm", 25 | "matplotlib", 26 | "pandas", 27 | "scikit-learn", 28 | "more-itertools", 29 | "gensim==3.8.3", 30 | "networkx", 31 | "notebook", 32 | "ipyplot", 33 | "jupyterlab", 34 | "statsmodels", 35 | "gdown", 36 | "annoy==1.17.1", 37 | "pyshark", 38 | "scapy", 39 | "ray", 40 | "ray[default]", 41 | "multiprocess", 42 | "addict", 43 | "config_io==0.4.0", 44 | "flask", 45 | ], # add any additional packages that 46 | # needs to be installed along with your package. Eg: 'caer' 47 | keywords=["python", "netshare"], 48 | classifiers=[ 49 | "Development Status :: 3 - Alpha", 50 | "Intended Audience :: Education", 51 | "Programming Language :: Python :: 3.9", 52 | "Operating System :: MacOS :: MacOS X", 53 | "Operating System :: POSIX :: Linux" 54 | ], 55 | ) 56 | -------------------------------------------------------------------------------- /traces/README.md: -------------------------------------------------------------------------------- 1 | Please download the all the example datasets [here](https://drive.google.com/drive/folders/1FOl1VMr0tXhzKEOupxnJE9YQ2GwfX2FD?usp=sharing). 2 | 3 | --- 4 | 5 | # Dataset description 6 | 7 | Three NetFlow datasets: Netflow data has the following schema TBD 8 | 9 | 1. [UGR16](https://nesg.ugr.es/nesg-ugr16/) dataset consists of traffic (including attacks) from NetFlow v9 collectors in a Spanish ISP network. We used data from the third week of March 2016. 10 | 2. [CIDDS](https://www.hs-coburg.de/forschung/forschungsprojekte-oeffentlich/informationstechnologie/cidds-coburg-intrusion-detection-data-sets.html) dataset emulates a small business environment with several clients and servers (e.g., email, web) with injected malicious traffic was executed. Each NetFlow entry recorded with the label (benign/attack) and attack type (DoS, brute force, port scan). 11 | 3. [TON](https://research.unsw.edu.au/projects/toniot-datasets) dataset represents telemetry IoT sensors. We use a sub-dataset (“Train_Test_datasets”) for evaluating cybersecurity-related ML algorithms; of its 461,013 records, 300,000 (65.07%) are normal, and the rest (34.93%) combine nine evenly-distributed attack types (e.g., backdoor, DDoS, injection, MITM). 12 | 13 | Three PCAP datasets: 14 | 15 | 1. [CAIDA](https://www.caida.org/catalog/datasets/passive_dataset/) contains anonymized traces from high-speed monitors on a commercial backbone link. Our subset is from the New York collector in March 2018. (**Require an CAIDA account to download the data**) 16 | 2. [DC](https://pages.cs.wisc.edu/~tbenson/IMC10_Data.html) dataset is a packet capture from the "UNI1" data center studied in the [IMC 2010 paper](https://pages.cs.wisc.edu/~tbenson/papers/imc192.pdf). 17 | 3. [CA](https://www.netresec.com/?page=MACCDC) dataset is traces from The U.S. National CyberWatch Mid-Atlantic Collegiate Cyber Defense Competitions from March 2012. 
18 | 19 | Zeek: Zeek logs have the following schema: TBD 20 | 21 | Wikipedia: The Wikipedia web page view logs have the following schema: TBD 22 | -------------------------------------------------------------------------------- /traces/caida-small/raw.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/traces/caida-small/raw.pcap -------------------------------------------------------------------------------- /util/README.md: -------------------------------------------------------------------------------- 1 | # Cluster prerequisites 2 | 1. Nodes are mounted with NFS. 3 | 2. Nodes can communicate with each other over SSH as a normal (non-root) user. 4 | 5 | # Ray cluster setup 6 | 7 | Launch the Ray cluster (run this command on your own laptop or on a host inside the cluster). 8 | 9 | Whether you launch the Ray cluster from your own laptop or from a host inside the cluster, make sure the conda environment "NetShare" is activated. 10 | 11 | 12 | ```bash 13 | # Change the head and worker IPs in example.yaml 14 | (NetShare) node1:/nfs/NetShare-dev$ export LC_ALL=C.UTF-8 15 | (NetShare) node1:/nfs/NetShare-dev$ ray up ray/example.yaml 16 | ``` 17 | 18 | If launching the cluster from within the cluster fails with an error like `FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ray/cluster-test.lock'`, it is most likely a Ray bug. 19 | 20 | The workaround is: 21 | 22 | ```bash 23 | # Change the head and worker IPs in example.yaml 24 | (NetShare) node1:/nfs/NetShare-dev$ export LC_ALL=C.UTF-8 25 | (NetShare) node1:/nfs/NetShare-dev$ ray start --head 26 | (NetShare) node1:/nfs/NetShare-dev$ ray stop 27 | (NetShare) node1:/nfs/NetShare-dev$ ray up ray/example.yaml 28 | ``` 29 | 30 | Check that the Ray cluster has launched successfully:
31 | ``` bash 32 | (NetShare) node1:/nfs/NetShare-dev$ ray status 33 | 34 | ======== Autoscaler status: 2022-07-23 10:08:03.979944 ======== 35 | Node status 36 | --------------------------------------------------------------- 37 | Healthy: 38 | 4 local.cluster.node 39 | Pending: 40 | (no pending nodes) 41 | Recent failures: 42 | (no failures) 43 | 44 | Resources 45 | --------------------------------------------------------------- 46 | Usage: 47 | 0.0/160.0 CPU 48 | 0.00/513.323 GiB memory 49 | 0.14/223.987 GiB object_store_memory 50 | 51 | Demands: 52 | (no resource demands) 53 | ``` 54 | 55 | Or 56 | 57 | ``` bash 58 | (NetShare) node1:/nfs/NetShare-dev$ python3 ray/check_nodes.py 59 | 60 | Python version 61 | 3.6.13 |Anaconda, Inc.| (default, Jun 4 2021, 14:25:59) 62 | [GCC 7.5.0] 63 | {'128.105.144.191', '128.105.144.190', '128.105.144.179', '128.105.144.199'} 64 | [{'NodeID': '58433beab7f1653cde1324b9a6764596fb0ef534eaf2182946ef28a4', 'Alive': True, 'NodeManagerAddress': '128.105.144.199', 'NodeManagerHostname': 'node4.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 42365, 'ObjectManagerPort': 34065, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 50427, 'NodeName': '128.105.144.199', 'alive': True, 'Resources': {'object_store_memory': 60156551577.0, 'node:128.105.144.199': 1.0, 'memory': 140365287015.0, 'CPU': 40.0}}, {'NodeID': '315f7a09c9e7633d7e6119730004188116696c069a463472671018c5', 'Alive': True, 'NodeManagerAddress': '128.105.144.191', 'NodeManagerHostname': 'node3.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 36329, 'ObjectManagerPort': 41259, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 58422, 'NodeName': '128.105.144.191', 'alive': True, 'Resources': {'CPU': 40.0, 'memory': 140363113677.0, 'object_store_memory': 60155620147.0, 'node:128.105.144.191': 1.0}}, {'NodeID': '30a870e576b48152b1150ca7d026ad9d51a16377121ad494355e7f76', 'Alive': True, 'NodeManagerAddress': '128.105.144.190', 'NodeManagerHostname': 'node2.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 35237, 'ObjectManagerPort': 33677, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 56875, 'NodeName': '128.105.144.190', 'alive': True, 'Resources': {'object_store_memory': 60154269696.0, 'node:128.105.144.190': 1.0, 'memory': 140359962624.0, 'CPU': 40.0}}, {'NodeID': '3a36f6e72af22d38d74f353ef6daf44a02f25668875b528c462d2f17', 'Alive': True, 'NodeManagerAddress': '128.105.144.179', 'NodeManagerHostname': 'node1.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 44069, 'ObjectManagerPort': 44719, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 65331, 'NodeName': '128.105.144.179', 'alive': True, 'Resources': {'object_store_memory': 60037563187.0, 'node:128.105.144.179': 1.0, 'CPU': 40.0, 'memory': 130087647437.0}}] 65 | ``` 66 | 67 | Check if dashboard has been launched successfully 68 | 69 | dashboard: http://:8265/ 
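Once the cluster is up, the Ray dashboard is served from the head node on port 8265 (as configured in `ray/example.yaml`), i.e. `http://<head_node_ip>:8265/`. For a quick programmatic sanity check, the sketch below attaches to the running cluster from any node and reports what it sees; this is only an illustration (it assumes the "NetShare" conda environment is active), and `util/ray/check_nodes.py` above does a more thorough job:

```python
# Minimal sanity check: attach to the cluster started by `ray up` and
# report how many nodes and CPUs it exposes.
import ray

ray.init(address="auto")  # connect to the existing cluster, do not start a new one

alive_nodes = [node for node in ray.nodes() if node["Alive"]]
print(f"Alive nodes: {len(alive_nodes)}")
print(f"Total CPUs: {ray.cluster_resources().get('CPU', 0)}")
```

To tear the cluster down when you are done, run `ray down ray/example.yaml` from the same place you ran `ray up` (or run `ray stop` on each node).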
-------------------------------------------------------------------------------- /util/grow-rootfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Ref: https://groups.google.com/g/powder-users/c/QQiQ4uWsFmI/m/4rPTmHt0BAAJ 4 | # HOW TO RUN the script 5 | # env RESIZEROOT=64 ./grow-rootfs.sh 6 | 7 | # 8 | # If user wants to resize the rootfs to the max, try to do that. 9 | # 10 | set -x 11 | 12 | 13 | if [ `id -u` -ne 0 ] ; then 14 | echo "This script must be run as root" 1>&2 15 | exit 1 16 | fi 17 | 18 | if [ -z "$RESIZEROOT" ]; then 19 | echo "ERROR: must define RESIZEROOT to the new total size (GB) you want for the rootfs!" 20 | exit 0 21 | fi 22 | if [ -z "$IMPOTENT" ]; then 23 | IMPOTENT=0 24 | fi 25 | if [ -z "$NODELETE" ]; then 26 | NODELETE=0 27 | fi 28 | 29 | # Remove any existing temp files. 30 | rm -fv /tmp/sfdisk.orig /tmp/sfdisk.new /tmp/sfdisk.new \ 31 | /tmp/sfdisk.nextstart /tmp/sfdisk.parts-to-delete 32 | 33 | # Find the root partition's parent disk. 34 | eval `lsblk -n -P -b -o NAME,FSTYPE,MOUNTPOINT,PARTTYPE,PARTUUID,TYPE,PKNAME,SIZE | grep 'MOUNTPOINT="/"'` 35 | ROOTPARENT=$PKNAME 36 | ROOT=$NAME 37 | if [ -z "$ROOTPARENT" -o -z "$ROOT" ]; then 38 | echo "ERROR: unable to find root device or root parent disk; aborting!" 39 | exit 1 40 | fi 41 | # Find root partition number. 42 | ROOTPARTNO=`echo "$ROOT" | sed -ne "s/^${ROOTPARENT}p\([0-9]*\)$/\1/p"` 43 | if [ ! $? -eq 0 -o -z "$ROOTPARTNO" ]; then 44 | ROOTPARTNO=`echo "$NAME" | sed -ne "s/^${ROOTPARENT}\([0-9]*\)$/\1/p"` 45 | fi 46 | if [ -z "$ROOTPARTNO" ]; then 47 | echo "ERROR: could not determine the root partition number; aborting!" 48 | exit 1 49 | fi 50 | 51 | # Save off the original partition table, and create a new one to modify. 52 | sfdisk -d /dev/$ROOTPARENT > /tmp/sfdisk.orig 53 | cp -p /tmp/sfdisk.orig /tmp/sfdisk.new 54 | 55 | if [ $NODELETE -eq 0 ]; then 56 | # Swapoff all swap devices if we are not impotent; they will be 57 | # removed. 58 | for dev in `blkid -t TYPE=swap | cut -d: -f1 | xargs` ; do 59 | if [ ! $IMPOTENT -eq 1 ]; then 60 | swapoff $dev 61 | if [ ! $? -eq 0 ]; then 62 | echo "ERROR: failed to swapoff $dev; aborting!" 63 | exit 1 64 | fi 65 | fi 66 | done 67 | 68 | # Figure out which partitions to remove. We remove any partition on 69 | # the rootparent with FSTYPE="" and MOUNTPOINT="" and 70 | # PARTUUID=(0fc63daf-8483-4772-8e79-3d69d8477de4|00000000-0000-0000-0000-000000000000|0657FD6D-A4AB-43C4-84E5-0933C84B4F4F|0x83|0x82|0x0). 71 | 72 | PARTS="" 73 | lsblk -a -n -P -b -o NAME,FSTYPE,MOUNTPOINT,PARTTYPE,PARTUUID,TYPE,PKNAME,SIZE | grep "PKNAME=\"${ROOTPARENT}\"" | while read line ; do 74 | eval "$line" 75 | if [ "$FSTYPE" != swap -a \( -n "$FSTYPE" -o -n "$MOUNTPOINT" \) ]; then 76 | continue 77 | fi 78 | echo "$PARTTYPE" | grep -qEi '^(0fc63daf-8483-4772-8e79-3d69d8477de4|00000000-0000-0000-0000-000000000000|0657FD6D-A4AB-43C4-84E5-0933C84B4F4F|0x83|0x82|0x0)$' 79 | if [ ! $? -eq 0 ]; then 80 | continue 81 | fi 82 | # Now extract the partition number (to feed to parted). Partition 83 | # number is not reported by most Linux tools nor by sysfs, so we 84 | # have to extract via regexp. Right now we only worry about nvme 85 | # devices (or any device that ends with a "p\d+"), and assume that 86 | # anything else is "standard". 87 | PARTNO=`echo "$NAME" | sed -ne "s/^${PKNAME}p\([0-9]*\)$/\1/p"` 88 | if [ ! $? -eq 0 -o -z "$PARTNO" ]; then 89 | PARTNO=`echo "$NAME" | sed -ne "s/^${PKNAME}\([0-9]*\)$/\1/p"` 90 | fi 91 | if [ ! $? 
-eq 0 -o -z "$PARTNO" ]; then 92 | continue 93 | fi 94 | PARTS="$PARTNO $PARTS" 95 | echo $PARTNO >> /tmp/sfdisk.parts-to-delete 96 | done 97 | 98 | if [ -e /tmp/sfdisk.parts-to-delete ]; then 99 | PARTS=`cat /tmp/sfdisk.parts-to-delete | xargs` 100 | rm -f /tmp/sfdisk.tmp 101 | cat /tmp/sfdisk.new | while read line ; do 102 | delete=0 103 | for part in $PARTS ; do 104 | echo "$line" | grep -q "^/dev/${ROOTPARENT}$part :" 105 | if [ $? -eq 0 ]; then 106 | delete=1 107 | break 108 | fi 109 | done 110 | if [ $delete -eq 0 ]; then 111 | echo "$line" >> /tmp/sfdisk.tmp 112 | fi 113 | done 114 | diff -u /tmp/sfdisk.new /tmp/sfdisk.tmp 115 | mv /tmp/sfdisk.tmp /tmp/sfdisk.new 116 | fi 117 | fi 118 | 119 | # 120 | # Now we need to figure out the max sector we can end on. If there is a 121 | # partition further up the disk, we can't stomp it. 122 | # 123 | DISKSIZE=`sfdisk -l /dev/$ROOTPARENT | sed -ne 's/^Disk.*, \([0-9]*\) sectors$/\1/p'` 124 | ROOTSTART=`sfdisk -l -o device,start,end /dev/$ROOTPARENT | sed -ne "s|/dev/${ROOT} *\([0-9]*\) *\([0-9]*\)$|\1|p"` 125 | ROOTEND=`sfdisk -l -o device,start,end /dev/$ROOTPARENT | sed -ne "s|/dev/${ROOT} *\([0-9]*\) *\([0-9]*\)$|\2|p"` 126 | ROOTSIZE=`expr $ROOTEND - $ROOTSTART + 1` 127 | # First, we find the max size of the new root partition in sectors. If 128 | # we find a partition with a start greater than ROOTEND, that value - 129 | # 2048 is the new end. Otherwise, it is DISKSIZE - 2048. 130 | nextstart=$DISKSIZE 131 | cat /tmp/sfdisk.new | grep "^/dev" | while read line ; do 132 | nstart=`echo $line | sed -ne "s|/dev/[^ ]* *: *start= *\([0-9]*\),.*$|\1|p"` 133 | if [ -z "$nstart" ] ; then 134 | continue 135 | fi 136 | if [ $nstart -gt $ROOTSTART -a $nstart -lt $nextstart ]; then 137 | nextstart=$nstart 138 | echo $nextstart > /tmp/sfdisk.nextstart 139 | fi 140 | done 141 | if [ -e /tmp/sfdisk.nextstart -a -s /tmp/sfdisk.nextstart ]; then 142 | nextstart=`cat /tmp/sfdisk.nextstart` 143 | fi 144 | align=0 145 | if [ ! `expr $nextstart \% 2048` -eq 0 ]; then 146 | align=2048 147 | fi 148 | maxsize=`expr $nextstart - $align - $ROOTSTART` 149 | # Sanitize the size. We only support GB. 150 | RESIZEROOT=`echo "$RESIZEROOT" | sed -ne 's/^\([0-9]*\)[^0-9]*$/\1/p'` 151 | if [ -z "$RESIZEROOT" ]; then 152 | echo "ERROR: could not determine size of root disk $ROOTPARENT; aborting!" 153 | exit 1 154 | fi 155 | if [ $RESIZEROOT -eq 0 ]; then 156 | newsize=$maxsize 157 | else 158 | usersectors=`expr $RESIZEROOT \* 1024 \* 1024 \* 1024 / 512` 159 | if [ $usersectors -gt $maxsize ]; then 160 | newsize=$maxsize 161 | else 162 | newsize=$usersectors 163 | fi 164 | fi 165 | if [ -z "$newsize" ]; then 166 | echo "ERROR: failed to calculate new root partition size; aborting!" 167 | exit 1 168 | fi 169 | 170 | 171 | 172 | if [ $newsize -eq $ROOTSIZE ]; then 173 | echo "Nothing to do: newsize ($newsize) same as current root size ($ROOTSIZE)." 174 | exit 0 175 | fi 176 | 177 | if [ $newsize -lt $ROOTSIZE ]; then 178 | echo "ERROR: newsize ($newsize) less than current root size ($ROOTSIZE); aborting!" 179 | exit 1 180 | fi 181 | 182 | 183 | if [ $newsize -lt 2048 ]; then 184 | echo "WARNING: cannot expand root partition; skipping!" 185 | exit 0 186 | fi 187 | 188 | # Finally, edit the sfdisk.new file to change the root device's size. 189 | cat /tmp/sfdisk.new | while read line ; do 190 | echo "$line" | grep -q "^/dev/${ROOT} :" 191 | if [ $? 
-eq 0 ]; then 192 | echo "$line" | sed -e "s|^\(/dev/${ROOT} :.*\)\(size= *[0-9]*,\)\(.*\)$|\1size=${newsize}\3|" >> /tmp/sfdisk.tmp 193 | else 194 | echo "$line" >> /tmp/sfdisk.tmp 195 | fi 196 | done 197 | mv /tmp/sfdisk.tmp /tmp/sfdisk.new 198 | 199 | diff -u /tmp/sfdisk.orig /tmp/sfdisk.new 200 | 201 | if [ $IMPOTENT -eq 1 ]; then 202 | exit 0 203 | fi 204 | 205 | sfdisk --force /dev/$ROOTPARENT < /tmp/sfdisk.new 206 | partprobe /dev/$ROOTPARENT 207 | resize2fs /dev/$ROOT 208 | if [ ! $? -eq 0 ]; then 209 | echo "ERROR: failed to resize /dev/$ROOT filesystem; aborting!" 210 | exit 1 211 | fi 212 | 213 | echo "Resized /dev/$ROOT." 214 | 215 | exit 0 216 | -------------------------------------------------------------------------------- /util/ray/check_nodes.py: -------------------------------------------------------------------------------- 1 | import ray 2 | import sys 3 | 4 | ray.init(address="auto") 5 | # ray.init(address='ray://128.105.144.254:10001') 6 | 7 | import time 8 | 9 | @ray.remote 10 | def f(): 11 | time.sleep(0.01) 12 | ip = ray._private.services.get_node_ip_address() 13 | # f = open(f"/nfs/ray-test/node{ip}.txt", "a") 14 | # f.write("1") 15 | # f.close() 16 | return ip 17 | 18 | # Get a list of the IP addresses of the nodes that have joined the cluster. 19 | print("Python version") 20 | print(sys.version) 21 | print(set(ray.get([f.remote() for _ in range(100000)]))) 22 | print(ray.nodes()) 23 | -------------------------------------------------------------------------------- /util/ray/example.yaml: -------------------------------------------------------------------------------- 1 | # A unique identifier for the head node and workers of this cluster. 2 | cluster_name: test 3 | 4 | provider: 5 | type: local 6 | head_ip: nfs.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us 7 | # You may need to supply a public ip for the head node if you need 8 | # to run `ray up` from outside of the Ray cluster's network 9 | # (e.g. the cluster is in an AWS VPC and you're starting ray from your laptop) 10 | # This is useful when debugging the local node provider with cloud VMs. 11 | # external_head_ip: YOUR_HEAD_PUBLIC_IP 12 | worker_ips: [node1.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node2.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node3.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node4.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node5.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node6.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node7.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node8.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node9.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node10.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us,node11.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node12.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node13.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node14.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node15.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node16.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node17.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node18.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node19.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node20.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us] 13 | # worker_ips: [10.10.1.2, 10.10.1.3, 10.10.1.4, 10.10.1.5, 10.10.1.6, 10.10.1.7, 10.10.1.8, 10.10.1.9, 10.10.1.10, 10.10.1.11] 14 | # worker_ips: [] 15 | # Optional when running automatic cluster management on prem. 
If you use a coordinator server, 16 | # then you can launch multiple autoscaling clusters on the same set of machines, and the coordinator 17 | # will assign individual nodes to clusters as needed. 18 | # coordinator_address: ":" 19 | 20 | # How Ray will authenticate with newly launched nodes. 21 | auth: 22 | ssh_user: yyucheng 23 | # You can comment out `ssh_private_key` if the following machines don't need a private key for SSH access to the Ray 24 | # cluster: 25 | # (1) The machine on which `ray up` is executed. 26 | # (2) The head node of the Ray cluster. 27 | # 28 | # The machine that runs ray up executes SSH commands to set up the Ray head node. The Ray head node subsequently 29 | # executes SSH commands to set up the Ray worker nodes. When you run ray up, ssh credentials sitting on the ray up 30 | # machine are copied to the head node -- internally, the ssh key is added to the list of file mounts to rsync to head node. 31 | ssh_private_key: ~/.ssh/id_rsa 32 | 33 | # The minimum number of workers nodes to launch in addition to the head 34 | # node. This number should be >= 0. 35 | # Typically, min_workers == max_workers == len(worker_ips). 36 | # This field is optional. 37 | # min_workers: 1 38 | 39 | # The maximum number of workers nodes to launch in addition to the head node. 40 | # This takes precedence over min_workers. 41 | # Typically, min_workers == max_workers == len(worker_ips). 42 | # This field is optional. 43 | # max_workers: 1 44 | # The default behavior for manually managed clusters is 45 | # min_workers == max_workers == len(worker_ips), 46 | # meaning that Ray is started on all available nodes of the cluster. 47 | # For automatically managed clusters, max_workers is required and min_workers defaults to 0. 48 | 49 | # The autoscaler will scale up the cluster faster with higher upscaling speed. 50 | # E.g., if the task requires adding more nodes then autoscaler will gradually 51 | # scale up the cluster in chunks of upscaling_speed*currently_running_nodes. 52 | # This number should be > 0. 53 | # upscaling_speed: 1.0 54 | 55 | # idle_timeout_minutes: 5 56 | 57 | # Files or directories to copy to the head and worker nodes. The format is a 58 | # dictionary from REMOTE_PATH: LOCAL_PATH. E.g. you could save your conda env to an environment.yaml file, mount 59 | # that directory to all nodes and call `conda -n my_env -f /path1/on/remote/machine/environment.yaml`. In this 60 | # example paths on all nodes must be the same (so that conda can be called always with the same argument) 61 | file_mounts: { 62 | # "/path1/on/remote/machine": "/path1/on/local/machine", 63 | # "/path2/on/remote/machine": "/path2/on/local/machine", 64 | # "~": "../NetShare-dev", 65 | } 66 | 67 | # Files or directories to copy from the head node to the worker nodes. The format is a 68 | # list of paths. The same path on the head node will be copied to the worker node. 69 | # This behavior is a subset of the file_mounts behavior. In the vast majority of cases 70 | # you should just use file_mounts. Only use this if you know what you're doing! 
71 | # cluster_synced_files: ["~/test.txt"] 72 | 73 | # Whether changes to directories in file_mounts or cluster_synced_files in the head node 74 | # should sync to the worker node continuously 75 | # file_mounts_sync_continuously: False 76 | 77 | # Patterns for files to exclude when running rsync up or rsync down 78 | # rsync_exclude: 79 | # - "**/.git" 80 | # - "**/.git/**" 81 | 82 | # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for 83 | # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided 84 | # as a value, the behavior will match git's behavior for finding and using .gitignore files. 85 | # rsync_filter: 86 | # - ".gitignore" 87 | 88 | # # List of commands that will be run before `setup_commands`. If docker is 89 | # # enabled, these commands will run outside the container and before docker 90 | # # is setup. 91 | # initialization_commands: [] 92 | 93 | # # List of shell commands to run to set up each nodes. 94 | setup_commands: [conda activate NetShare] 95 | # # If we have e.g. conda dependencies stored in "/path1/on/local/machine/environment.yaml", we can prepare the 96 | # # work environment on each worker by: 97 | # # 1. making sure each worker has access to this file i.e. see the `file_mounts` section 98 | # # 2. adding a command here that creates a new conda environment on each node or if the environment already exists, 99 | # # it updates it: 100 | # # conda env create -q -n my_venv -f /path1/on/local/machine/environment.yaml || conda env update -q -n my_venv -f /path1/on/local/machine/environment.yaml 101 | # # 102 | # # Ray developers: 103 | # # you probably want to create a Docker image that 104 | # # has your Ray repo pre-cloned. Then, you can replace the pip installs 105 | # # below with a git checkout (and possibly a recompile). 106 | # # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image 107 | # # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line: 108 | # # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" 109 | 110 | # # Custom commands that will be run on the head node after common setup. 111 | head_setup_commands: [echo "hello from the head node"] 112 | 113 | # # Custom commands that will be run on worker nodes after common setup. 114 | worker_setup_commands: [echo "hello from worker nodes"] 115 | 116 | # Command to start ray on the head node. You don't need to change this. 117 | head_start_ray_commands: 118 | # If we have e.g. conda dependencies, we could create on each node a conda environment (see `setup_commands` section). 119 | # In that case we'd have to activate that env on each node before running `ray`: 120 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ray stop 121 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ulimit -c unlimited && ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml --include-dashboard=True --dashboard-host=0.0.0.0 --dashboard-port=8265 --temp-dir=/users/yyucheng/tmp 122 | # --system-config="{\"kill_idle_workers_interval_ms\":10}" 123 | # - ray stop 124 | # - ulimit -c unlimited && ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml 125 | 126 | # Command to start ray on worker nodes. You don't need to change this. 127 | worker_start_ray_commands: 128 | # If we have e.g. 
conda dependencies, we could create on each node a conda environment (see `setup_commands` section). 129 | # In that case we'd have to activate that env on each node before running `ray`: 130 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ray stop 131 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ray start --address=$RAY_HEAD_IP:6379 132 | # - ray stop 133 | # - ray start --address=$RAY_HEAD_IP:6379 -------------------------------------------------------------------------------- /util/setup-cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd $HOME 3 | 4 | VIRTUAL_ENV=$1 5 | USERNAME=$2 6 | CONDA_EXEC=$HOME/anaconda3/bin/conda 7 | NETSHARE_LOCAL_REPO=/nfs/NetShare 8 | 9 | # Anaconda3 10 | if [ -f $CONDA_EXEC ] 11 | then 12 | echo "Anaconda3 installed." 13 | else 14 | echo "Anaconda3 not installed. Start installation now..." 15 | wget https://repo.anaconda.com/archive/Anaconda3-2022.05-Linux-x86_64.sh 16 | bash Anaconda3-2022.05-Linux-x86_64.sh -b -p $HOME/anaconda3 17 | fi 18 | eval "$($HOME/anaconda3/bin/conda shell.bash hook)" 19 | conda init 20 | 21 | # Create the conda environment if it does not exist yet 22 | if ! { conda env list | grep $VIRTUAL_ENV; } >/dev/null 2>&1 23 | then 24 | echo "Conda environment $VIRTUAL_ENV not installed." 25 | conda create -y --name $VIRTUAL_ENV python=3.6 26 | else 27 | echo "Conda environment $VIRTUAL_ENV installed." 28 | fi 29 | source $HOME/anaconda3/etc/profile.d/conda.sh 30 | conda activate $VIRTUAL_ENV 31 | 32 | # Clone the repo if it has not been cloned yet 33 | if ! [ -d "$NETSHARE_LOCAL_REPO" ] 34 | then 35 | echo "git clone from remote repo..." 36 | git clone https://github.com/netsharecmu/NetShare.git $NETSHARE_LOCAL_REPO 37 | else 38 | echo "$NETSHARE_LOCAL_REPO exists! Skip git clone..." 39 | fi 40 | 41 | export LC_ALL=C.UTF-8 42 | export LANG=C.UTF-8 43 | ray start --head && ray stop 44 | 45 | cd $NETSHARE_LOCAL_REPO 46 | pip3 install -e . -------------------------------------------------------------------------------- /util/setup_node_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | USER=yyucheng 4 | NUMHOSTS=21 5 | EXPERIMENTNAME=netshare-wisc 6 | PROJECTNAME=cloudmigration-pg0 7 | # LOCATION=utah 8 | LOCATION=wisc 9 | # LOCATION=clemson 10 | SITE=cloudlab.us 11 | 12 | pids=() 13 | 14 | # setup controller 15 | NODE_SYSTEM="${USER}@nfs.${EXPERIMENTNAME}.${PROJECTNAME}.${LOCATION}.${SITE}" 16 | # NODE_SYSTEM="${USER}@nfs.${EXPERIMENTNAME}.cloudmigration.emulab.net" 17 | echo $NODE_SYSTEM 18 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "sudo -n env RESIZEROOT=192 bash -s" < grow-rootfs.sh 19 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "bash -s" < setup-cpu.sh "NetShare" & 20 | scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ~/.ssh/netshare-package $NODE_SYSTEM:~/.ssh/id_rsa & 21 | pids+=($!)
22 | 23 | # setup workers 24 | COUNTER=1 25 | while [ $COUNTER -lt $NUMHOSTS ]; do 26 | NODE="node${COUNTER}" 27 | NODE_SYSTEM="${USER}@${NODE}.${EXPERIMENTNAME}.${PROJECTNAME}.${LOCATION}.${SITE}" 28 | # NODE_SYSTEM="${USER}@${NODE}.${EXPERIMENTNAME}.cloudmigration.emulab.net" 29 | echo $NODE_SYSTEM 30 | 31 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "sudo -n env RESIZEROOT=192 bash -s" < grow-rootfs.sh 32 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "bash -s" < setup-cpu.sh "NetShare" & 33 | scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ~/.ssh/netshare-package $NODE_SYSTEM:~/.ssh/id_rsa & 34 | 35 | pids+=($!) 36 | let COUNTER=COUNTER+1 37 | done 38 | 39 | for pid in "${pids[@]}"; do 40 | wait "$pid" 41 | done --------------------------------------------------------------------------------