├── .gitignore ├── LICENSE ├── README.md ├── examples ├── README.md ├── netflow │ ├── config_example_netflow_nodp.json │ └── driver.py └── pcap │ ├── config_example_pcap_nodp.json │ └── driver.py ├── netshare ├── __init__.py ├── configs │ └── default │ │ ├── __init__.py │ │ ├── dg_table_row_per_sample.json │ │ └── single_event_per_row.json ├── generators │ ├── __init__.py │ └── generator.py ├── model_managers │ ├── __init__.py │ ├── dg_model_manager.py │ ├── model_manager.py │ └── netshare_manager │ │ ├── generate_helper.py │ │ ├── netshare_manager.py │ │ ├── netshare_util.py │ │ └── train_helper.py ├── models │ ├── __init__.py │ ├── doppelganger_torch │ │ ├── __init__.py │ │ ├── doppelganger.py │ │ ├── load_data.py │ │ ├── network.py │ │ ├── privacy_util.py │ │ └── util.py │ ├── doppelganger_torch_model.py │ └── model.py ├── pre_post_processors │ ├── __init__.py │ ├── dg_row_per_sample_pre_post_processor.py │ ├── netshare │ │ ├── README.md │ │ ├── choose_best_model.py │ │ ├── denormalize_fields.py │ │ ├── dist_metrics.py │ │ ├── embedding_helper.py │ │ ├── main.c │ │ ├── netshare_pre_post_processor.py │ │ ├── packet.h │ │ ├── preprocess_helper.py │ │ ├── sharedlib.sh │ │ ├── util.py │ │ └── word2vec_embedding.py │ └── pre_post_processor.py ├── ray │ ├── __init__.py │ ├── config.py │ ├── ray_functions.py │ └── remote.py └── utils │ ├── __init__.py │ ├── exec_cmd.py │ ├── field.py │ ├── logger.py │ ├── output.py │ └── tee.py ├── setup.py ├── traces ├── README.md ├── caida-small │ └── raw.pcap └── ugr16-small │ └── raw.csv └── util ├── README.md ├── grow-rootfs.sh ├── ray ├── check_nodes.py └── example.yaml ├── setup-cpu.sh └── setup_node_parallel.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | .DS_Store 4 | .vscode/ 5 | placeholder 6 | data 7 | debug 8 | 9 | netshare/dashboard/static/tmp 10 | 11 | tests/ 12 | traces/ 13 | results/ 14 | rsync*.sh 15 | *.pkl 16 | *.ini 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | *.o 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | pip-wheel-metadata/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | db.sqlite3-journal 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Clear BSD License 2 | 3 | Copyright (c) 2022 Carnegie Mellon University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the disclaimer below) provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | * Neither the name of Carnegie Mellon University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Practical GAN-based Synthetic IP Header Trace Generation using NetShare
2 |
3 | [[paper (SIGCOMM 2022)](https://dl.acm.org/doi/abs/10.1145/3544216.3544251)]
4 | [[talk (SIGCOMM 2022)](https://www.youtube.com/watch?v=mWnFIncjtWg)]
5 | [[talk (ZeekWeek 2022)](https://www.youtube.com/watch?v=MN_fa-FBOHg)]
6 | [[talk (FloCon 2023)](https://resources.sei.cmu.edu/library/asset-view.cfm?assetid=890917)]
7 | [[web service demo](https://drive.google.com/file/d/1vPuneEb14A2w7fKyCJ41NAHzsvpLQP5H/view)]
8 |
9 | **Authors:**
10 | [[Yucheng Yin](https://sniperyyc.com/)]
11 | [[Zinan Lin](http://www.andrew.cmu.edu/user/zinanl/)]
12 | [[Minhao Jin](https://www.linkedin.com/in/minhao-jin-1328b8164/)]
13 | [[Giulia Fanti](https://www.andrew.cmu.edu/user/gfanti/)]
14 | [[Vyas Sekar](https://users.ece.cmu.edu/~vsekar/)]
15 |
16 | **Abstract:** We explore the feasibility of using Generative Adversarial Networks (GANs) to automatically learn generative models to generate synthetic packet- and flow header traces for networking tasks (e.g., telemetry, anomaly detection, provisioning). We identify key fidelity, scalability, and privacy challenges and tradeoffs in existing GAN-based approaches. By synthesizing domain-specific insights with recent advances in machine learning and privacy, we identify design choices to tackle these challenges. Building on these insights, we develop an end-to-end framework, NetShare. We evaluate NetShare on six diverse packet header traces and find that: (1) across distributional metrics and traces, it achieves 46% more accuracy than baselines, and (2) it meets users’ requirements of downstream tasks in evaluating accuracy and rank ordering of candidate approaches.
17 |
18 | # News
19 | [2023.04] Woohoo! New version released with a list of new features:
20 | - Bump Python version to 3.9
21 | - Replace tensorflow 1.15 with torch
22 | - Support generic dataset formats
23 | - Add [SDMetrics](https://github.com/netsharecmu/SDMetrics_timeseries/tree/master/sdmetrics) for hyperparameter/model selection and data visualization
24 |
25 | [2022.08] The deprecated [`camera-ready`](https://github.com/netsharecmu/NetShare/releases/tag/camera-ready-deprecated) branch holds the scripts we used to run all the experiments in the [paper](https://dl.acm.org/doi/abs/10.1145/3544216.3544251).
26 |
27 | # Users
28 | NetShare has been used by several independent users/companies.
29 |
30 | - [Purdue CS536 Fall 2022 Class project](https://github.com/annuszulfiqar2021/NetShare)
31 | - [Rockfish Data](https://rockfish.ai/index.html)
32 |
33 | # Datasets
34 | ***We are adding more datasets! Feel free to add your own and contribute!***
35 |
36 | Our paper uses **six** public datasets for reproducibility.
Please download the six datasets [here](https://drive.google.com/drive/folders/1FOl1VMr0tXhzKEOupxnJE9YQ2GwfX2FD?usp=sharing) and put them under `traces/`.
37 |
38 | You may also refer to the [README](traces/README.md) for detailed descriptions of the datasets.
39 |
40 |
41 | # Setup
42 | ## Step 0: Install `libpcap` dependency (Optional)
43 | If you are working with PCAP files and have not installed `libpcap`:
44 | - On macOS, install using `homebrew`:
45 | ```Bash
46 | brew install libpcap
47 | ```
48 | - On Debian-based systems (e.g., Ubuntu), install using `apt`:
49 | ```Bash
50 | sudo apt install libpcap-dev
51 | ```
52 |
53 | ## Step 1: Install NetShare Python package (Required)
54 | We recommend installing NetShare in a virtual environment (e.g., Anaconda3). We test with a Python 3.9 virtual environment.
55 |
56 | ```Bash
57 | # Assume Anaconda is installed
58 | # Create the virtual environment if it does not exist
59 | conda create --name NetShare python=3.9
60 |
61 | # Activate virtual env
62 | conda activate NetShare
63 |
64 | # Install NetShare package
65 | git clone https://github.com/netsharecmu/NetShare.git
66 | pip3 install -e NetShare/
67 |
68 | # Install SDMetrics package
69 | git clone https://github.com/netsharecmu/SDMetrics_timeseries
70 | pip3 install -e SDMetrics_timeseries/
71 | ```
72 |
73 | ## Step 2: How to start Ray? (Optional but **strongly** recommended)
74 | Ray is a unified framework for scaling AI and Python applications. Our framework utilizes Ray to increase parallelism and distribute workloads across the cluster automatically and efficiently.
75 |
76 | ### Laptop/single-machine (only recommended for demo/dev/fun)
77 | ```
78 | ray start --head --port=6379 --include-dashboard=True --dashboard-host=0.0.0.0 --dashboard-port=8265
79 | ```
80 |
81 | Please go to [http://localhost:8265](http://localhost:8265) to view the Ray dashboard.
82 |
83 |
84 | ### Multi-machine (**strongly** recommended for faster training/generation)
85 | We provide a utility script and [README](util/README.md) under `util/` for setting up a Ray cluster. As a reference, we use [Cloudlab](https://www.cloudlab.us/), which is referred to as a "custom cluster" in the Ray documentation. If you are using a different cluster (e.g., AWS, GCP, Azure), please refer to the [Ray doc](https://docs.ray.io/en/releases-2.0.0rc0/cluster/cloud.html#cluster-cloud) for details.
86 |
87 |
88 |
89 | # Example usage
90 | ***We are adding more examples of usage (PCAP, NetFlow, w/ and w/o DP). Please stay tuned!***
91 |
92 | Here is a minimal working example to generate synthetic NetFlow data without differential privacy. Please change your working directory to `examples/` by `cd examples/`.
93 |
94 | You may refer to [`examples`](examples/) for more scripts and config files.
95 |
96 | [Driver code](examples/netflow/driver.py)
97 | ```Python
98 | import random
99 | import netshare.ray as ray
100 | from netshare import Generator
101 |
102 | if __name__ == '__main__':
103 |     # Change to False if you would not like to use Ray
104 |     ray.config.enabled = False
105 |     ray.init(address="auto")
106 |
107 |     # configuration file
108 |     generator = Generator(config="config_example_netflow_nodp.json")
109 |
110 |     # `work_folder` should not exist o/w an overwrite error will be thrown.
111 |     # Please set the `work_folder` as *absolute path*
112 |     # if you are using Ray with multi-machine setup
113 |     # since Ray has bugs when dealing with relative paths.
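    # (Added note, not part of the original example: one simple way to
    # guarantee an absolute path is to resolve it explicitly, e.g.
    #     work_folder = os.path.abspath('../../results/test-ugr16')
    # which additionally requires `import os` at the top of this script.)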
114 |     generator.train(work_folder=f'../../results/test-ugr16')
115 |     generator.generate(work_folder=f'../../results/test-ugr16')
116 |     generator.visualize(work_folder=f'../../results/test-ugr16')
117 |
118 |     ray.shutdown()
119 | ```
120 |
121 | The corresponding configuration file is [`config_example_netflow_nodp.json`](examples/netflow/config_example_netflow_nodp.json).
122 | You may refer to the [README](netshare/configs/README.md) for detailed explanations of the configuration files.
123 |
124 | After generation, you will be redirected to a dashboard where a side-by-side visual comparison between real and synthetic data will be shown.
125 |
126 | # Codebase structure (for *dev* purposes)
127 | ```
128 | ├── doc                    # (tentative) NetShare tutorials and APIs
129 | ├── examples               # Examples of using NetShare on different datasets
130 | ├── netshare               # NetShare source code
131 | │   ├── configs            # Default configurations
132 | │   ├── generators         # Generator class
133 | │   ├── model_managers     # Core of NetShare service (i.e., train/generate)
134 | │   ├── models             # Timeseries GAN models (e.g., DoppelGANger)
135 | │   ├── pre_post_processors # Pre- and post-process data
136 | │   ├── ray                # Ray functions overloading
137 | │   └── utils              # Utility functions/common class definitions
138 | ├── traces                 # Traces/datasets
139 | └── util                   # MISC/setup scripts
140 |     └── ray                # Ray setup script
141 | ```
142 |
143 |
144 | # References
145 | Please cite our paper/codebase appropriately if you find NetShare useful.
146 |
147 | ```bibtex
148 | @inproceedings{netshare-sigcomm2022,
149 | author = {Yin, Yucheng and Lin, Zinan and Jin, Minhao and Fanti, Giulia and Sekar, Vyas},
150 | title = {Practical GAN-Based Synthetic IP Header Trace Generation Using NetShare},
151 | year = {2022},
152 | isbn = {9781450394208},
153 | publisher = {Association for Computing Machinery},
154 | address = {New York, NY, USA},
155 | url = {https://doi.org/10.1145/3544216.3544251},
156 | doi = {10.1145/3544216.3544251},
157 | abstract = {We explore the feasibility of using Generative Adversarial Networks (GANs) to automatically learn generative models to generate synthetic packet- and flow header traces for networking tasks (e.g., telemetry, anomaly detection, provisioning). We identify key fidelity, scalability, and privacy challenges and tradeoffs in existing GAN-based approaches. By synthesizing domain-specific insights with recent advances in machine learning and privacy, we identify design choices to tackle these challenges. Building on these insights, we develop an end-to-end framework, NetShare.
We evaluate NetShare on six diverse packet header traces and find that: (1) across all distributional metrics and traces, it achieves 46% more accuracy than baselines and (2) it meets users' requirements of downstream tasks in evaluating accuracy and rank ordering of candidate approaches.},
158 | booktitle = {Proceedings of the ACM SIGCOMM 2022 Conference},
159 | pages = {458–472},
160 | numpages = {15},
161 | keywords = {privacy, synthetic data generation, network packets, network flows, generative adversarial networks},
162 | location = {Amsterdam, Netherlands},
163 | series = {SIGCOMM '22}
164 | }
165 | ```
166 |
167 | Part of the source code is adapted from the following open-source projects:
168 |
169 | - [DoppelGANger](https://github.com/fjxmlzn/DoppelGANger)
170 | - [GPUTaskScheduler](https://github.com/fjxmlzn/GPUTaskScheduler)
171 | - [BSN](https://github.com/fjxmlzn/BSN)
172 | - [Ray](https://github.com/ray-project/ray)
173 | - [config_io](https://github.com/fjxmlzn/config_io)
174 | - [SDMetrics](https://github.com/sdv-dev/SDMetrics)
175 |
-------------------------------------------------------------------------------- /examples/README.md: --------------------------------------------------------------------------------
1 | We support multiple common data schemas; here are a few examples with the corresponding configuration files. You may start from the "nearest match" to your own data.
2 |
3 | **Note: across all examples, `iteration` is set to a small number to ensure a quick E2E test. For generating high-quality synthetic data, we recommend increasing `iteration` based on your experience and computational resources.**
4 |
5 | # Prerequisite
6 | We support four different field types:
7 | 1. Bit field (encoded as bit strings), e.g.,
8 | ```JSON
9 | {
10 |     "column": "srcip",
11 |     "type": "integer",
12 |     "encoding": "bit",
13 |     "n_bits": 32
14 | }
15 | ```
16 | An optional property of this field is `truncate`, a boolean that defaults to `false`. If `truncate` is set to `true`, large integers are truncated so that only the most significant `n_bits` bits are kept.
17 |
18 | 2. Word2Vec field (encoded as Word2Vec vectors), e.g.,
19 | ```JSON
20 | {
21 |     "column": "srcport",
22 |     "type": "integer",
23 |     "encoding": "word2vec_port"
24 | }
25 | ```
26 | 3. Categorical field (one-hot encoded), e.g.,
27 | ```JSON
28 | {
29 |     "column": "type",
30 |     "type": "string",
31 |     "encoding": "categorical"
32 | }
33 | ```
34 | 4. Continuous field, e.g.,
35 | ```JSON
36 | {
37 |     "column": "pkt",
38 |     "type": "float",
39 |     "normalization": "ZERO_ONE",
40 |     "log1p_norm": true
41 | }
42 | ```
43 |
44 | # Dataset type 1: single-event
45 | The single-event schema contains one event (a single timeseries record) per row.
46 |
47 | ## Data schema
48 | | Timestamp (optional) | Metadata 1 | Metadata 2 | ... | Timeseries 1 | Timeseries 2 | ... |
49 | |:--------------------:|:----------:|:----------:|:---:|:-------------:|:-------------:|:---:|
50 | | t1 | | | | | | |
51 | | t2 | | | | | | |
52 | | ... | | | | | | |
53 |
54 | ## Examples
55 | 1. PCAP
56 | | Timestamp | Srcip | Dstip | Srcport | Dstport | Proto | Pkt_size | ... |
57 | |:---------:|:-----:|:-----:|:-------:|:-------:|:-----:|:--------:|:---:|
58 | | t1 | | | | | | | |
59 | | t2 | | | | | | | |
60 | | ... | | | | | | | |
61 |
62 | 2. NetFlow ([configuration_file](netflow/config_example_netflow_nodp.json))
63 |
64 |
70 |
71 |
72 |
73 | # [Dataset type 2: multi-event](./dg_table_row_per_sample/README.md)
74 | The multi-event data schema packs a whole series (multiple timestamped events) into each row, as illustrated by the sketch below.
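For intuition, here is a minimal sketch of the difference between the two schemas. It is purely illustrative: the column names and values are invented for this sketch and are not part of NetShare's API or of any shipped example.

```Python
import pandas as pd

# Single-event (dataset type 1): one event per row; rows that share the same
# metadata (e.g., the 5-tuple) together form one timeseries.
single_event = pd.DataFrame({
    "ts":    [1, 2, 3],            # timestamp (optional)
    "srcip": ["10.0.0.1"] * 3,     # metadata
    "pkt":   [12, 7, 30],          # timeseries measurement
})

# Multi-event (dataset type 2): one whole series per row; metadata columns
# come first, followed by one column per {timestamp, measurement} group
# (cf. the Wikipedia page-view example below).
multi_event = pd.DataFrame({
    "domain":      ["en.wikipedia.org"],  # metadata
    "access_type": ["desktop"],           # metadata
    "day_1_views": [120],                 # event 1
    "day_2_views": [98],                  # event 2
})
```

The schema tables below make the same distinction more formally.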
75 | 76 | ## Data Schema 77 | | Metadata 1 | Metadata 2 | ... | {Timestamp (optional), Timeseries 1, Timeseries 2, ...} | {Timestamp (optional), Timeseries 1, Timeseries 2, ...} | ... | 78 | |:----------:|:----------:|:---:|:-------------------------------------------------------:|:-------------------------------------------------------:|:---:| 79 | | | | | | | | 80 | | | | | | | | 81 | 82 | ## Examples 83 | 1. Wikipedia dataset ([configuration_file](./dg_table_row_per_sample/config_example_wiki.json)) 84 | | Domain | Access type | Agent | {Date 1, page view} | {Date 2, page view} | ... | 85 | |:------:|:-----------:|:-----:|:-------------------:|:-------------------:|:---:| 86 | | | | | | | | 87 | | | | | | | | 88 | 89 | -------------------------------------------------------------------------------- /examples/netflow/config_example_netflow_nodp.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "original_data_file": "../../traces/ugr16-small/raw.csv", 4 | "overwrite": true, 5 | "dataset_type": "netflow", 6 | "n_chunks": 2, 7 | "dp": false 8 | }, 9 | "default": "single_event_per_row.json", 10 | "pre_post_processor": { 11 | "class": "NetsharePrePostProcessor", 12 | "config": { 13 | "timestamp": { 14 | "column": "ts", 15 | "generation": true, 16 | "encoding": "interarrival", 17 | "normalization": "ZERO_ONE" 18 | }, 19 | "word2vec": { 20 | "vec_size": 10, 21 | "model_name": "word2vec_vecSize", 22 | "annoy_n_trees": 100, 23 | "pretrain_model_path": null 24 | }, 25 | "metadata": [ 26 | { 27 | "column": "srcip", 28 | "type": "integer", 29 | "encoding": "bit", 30 | "n_bits": 32, 31 | "categorical_mapping": false 32 | }, 33 | { 34 | "column": "dstip", 35 | "type": "integer", 36 | "encoding": "bit", 37 | "n_bits": 32, 38 | "categorical_mapping": false 39 | }, 40 | { 41 | "column": "srcport", 42 | "type": "integer", 43 | "encoding": "word2vec_port" 44 | }, 45 | { 46 | "column": "dstport", 47 | "type": "integer", 48 | "encoding": "word2vec_port" 49 | }, 50 | { 51 | "column": "proto", 52 | "type": "string", 53 | "encoding": "word2vec_proto" 54 | } 55 | ], 56 | "timeseries": [ 57 | { 58 | "column": "td", 59 | "type": "float", 60 | "normalization": "ZERO_ONE", 61 | "log1p_norm": true 62 | }, 63 | { 64 | "column": "pkt", 65 | "type": "float", 66 | "normalization": "ZERO_ONE", 67 | "log1p_norm": true 68 | }, 69 | { 70 | "column": "byt", 71 | "type": "float", 72 | "normalization": "ZERO_ONE", 73 | "log1p_norm": true 74 | }, 75 | { 76 | "column": "type", 77 | "type": "string", 78 | "encoding": "categorical" 79 | } 80 | ] 81 | } 82 | }, 83 | "model": { 84 | "class": "DoppelGANgerTorchModel", 85 | "config": { 86 | "batch_size": 100, 87 | "sample_len": [ 88 | 1, 89 | 5, 90 | 10 91 | ], 92 | "sample_len_expand": true, 93 | "epochs": 40, 94 | "extra_checkpoint_freq": 1, 95 | "epoch_checkpoint_freq": 5 96 | } 97 | } 98 | } -------------------------------------------------------------------------------- /examples/netflow/driver.py: -------------------------------------------------------------------------------- 1 | import random 2 | import netshare.ray as ray 3 | from netshare import Generator 4 | 5 | if __name__ == '__main__': 6 | # Change to False if you would not like to use Ray 7 | ray.config.enabled = False 8 | ray.init(address="auto") 9 | 10 | # configuration file 11 | generator = Generator(config="config_example_netflow_nodp.json") 12 | 13 | # `work_folder` should not exist o/w an overwrite error will be thrown. 
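    # (Added note: this example's config sets `"overwrite": true` in
    # `global_config`; in that case an existing work_folder is reused with a
    # warning instead of raising an error.)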
14 | # Please set the `worker_folder` as *absolute path* 15 | # if you are using Ray with multi-machine setup 16 | # since Ray has bugs when dealing with relative paths. 17 | generator.train(work_folder=f'../../results/test-ugr16') 18 | generator.generate(work_folder=f'../../results/test-ugr16') 19 | generator.visualize(work_folder=f'../../results/test-ugr16') 20 | 21 | ray.shutdown() 22 | -------------------------------------------------------------------------------- /examples/pcap/config_example_pcap_nodp.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "original_data_file": "../../traces/caida-small/raw.pcap", 4 | "overwrite": true, 5 | "dataset_type": "pcap", 6 | "n_chunks": 1, 7 | "dp": false 8 | }, 9 | "default": "single_event_per_row.json", 10 | "pre_post_processor": { 11 | "class": "NetsharePrePostProcessor", 12 | "config": { 13 | "timestamp": { 14 | "column": "time", 15 | "generation": true, 16 | "encoding": "interarrival", 17 | "normalization": "ZERO_ONE" 18 | }, 19 | "word2vec": { 20 | "vec_size": 10, 21 | "model_name": "word2vec_vecSize", 22 | "annoy_n_trees": 100, 23 | "pretrain_model_path": null 24 | }, 25 | "metadata": [ 26 | { 27 | "column": "srcip", 28 | "type": "integer", 29 | "encoding": "bit", 30 | "n_bits": 32, 31 | "categorical_mapping": false 32 | }, 33 | { 34 | "column": "dstip", 35 | "type": "integer", 36 | "encoding": "bit", 37 | "n_bits": 32, 38 | "categorical_mapping": false 39 | }, 40 | { 41 | "column": "srcport", 42 | "type": "integer", 43 | "encoding": "word2vec_port" 44 | }, 45 | { 46 | "column": "dstport", 47 | "type": "integer", 48 | "encoding": "word2vec_port" 49 | }, 50 | { 51 | "column": "proto", 52 | "type": "string", 53 | "encoding": "word2vec_proto" 54 | } 55 | ], 56 | "timeseries": [ 57 | { 58 | "column": "pkt_len", 59 | "type": "float", 60 | "normalization": "ZERO_ONE" 61 | }, 62 | { 63 | "column": "tos", 64 | "type": "float", 65 | "normalization": "ZERO_ONE", 66 | "min_x": 0.0, 67 | "max_x": 255.0 68 | }, 69 | { 70 | "column": "id", 71 | "type": "float", 72 | "normalization": "ZERO_ONE", 73 | "min_x": 0.0, 74 | "max_x": 65535.0 75 | }, 76 | { 77 | "column": "flag", 78 | "type": "integer", 79 | "encoding": "categorical", 80 | "choices": [ 81 | 0, 82 | 1, 83 | 2 84 | ] 85 | }, 86 | { 87 | "column": "off", 88 | "type": "float", 89 | "normalization": "ZERO_ONE", 90 | "min_x": 0.0, 91 | "max_x": 8191.0 92 | }, 93 | { 94 | "column": "ttl", 95 | "type": "float", 96 | "normalization": "ZERO_ONE", 97 | "min_x": 0.0, 98 | "max_x": 255.0 99 | } 100 | ] 101 | } 102 | }, 103 | "model": { 104 | "class": "DoppelGANgerTorchModel", 105 | "config": { 106 | "batch_size": 100, 107 | "sample_len": [ 108 | 10 109 | ], 110 | "sample_len_expand": true, 111 | "epochs": 40, 112 | "extra_checkpoint_freq": 1, 113 | "epoch_checkpoint_freq": 5 114 | } 115 | } 116 | } -------------------------------------------------------------------------------- /examples/pcap/driver.py: -------------------------------------------------------------------------------- 1 | import random 2 | import netshare.ray as ray 3 | from netshare import Generator 4 | 5 | if __name__ == '__main__': 6 | # Change to False if you would not like to use Ray 7 | ray.config.enabled = False 8 | ray.init(address="auto") 9 | 10 | # configuration file 11 | generator = Generator(config="config_example_pcap_nodp.json") 12 | 13 | # `work_folder` should not exist o/w an overwrite error will be thrown. 
14 | # Please set the `worker_folder` as *absolute path* 15 | # if you are using Ray with multi-machine setup 16 | # since Ray has bugs when dealing with relative paths. 17 | generator.train(work_folder='../../results/test-caida') 18 | generator.generate(work_folder='../../results/test-caida') 19 | generator.visualize(work_folder='../../results/test-caida') 20 | 21 | ray.shutdown() 22 | -------------------------------------------------------------------------------- /netshare/__init__.py: -------------------------------------------------------------------------------- 1 | from .generators.generator import Generator 2 | 3 | __all__ = ['Generator'] 4 | -------------------------------------------------------------------------------- /netshare/configs/default/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/netshare/configs/default/__init__.py -------------------------------------------------------------------------------- /netshare/configs/default/dg_table_row_per_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "overwrite": false, 4 | "original_data_file": "" 5 | }, 6 | "pre_post_processor": { 7 | "class": "DGRowPerSamplePrePostProcessor", 8 | "config": { 9 | "num_train_samples": 50000, 10 | "num_test_samples": 50000, 11 | "metadata": [ 12 | ], 13 | "timeseries": [ 14 | ] 15 | } 16 | }, 17 | "model_manager": { 18 | "class": "DGModelManager", 19 | "config": { 20 | } 21 | }, 22 | "model": { 23 | "class": "DoppelGANgerTFModel", 24 | "config": { 25 | "batch_size": 100, 26 | "sample_len": 10, 27 | "iteration": 200000, 28 | "vis_freq": 100000, 29 | "vis_num_sample": 5, 30 | "d_rounds": 1, 31 | "g_rounds": 1, 32 | "num_packing": 1, 33 | "noise": true, 34 | "attr_noise_type": "normal", 35 | "feature_noise_type": "normal", 36 | "rnn_mlp_num_layers": 0, 37 | "feed_back": false, 38 | "g_lr": 0.001, 39 | "d_lr": 0.001, 40 | "d_gp_coe": 10.0, 41 | "gen_feature_num_layers": 1, 42 | "gen_feature_num_units": 100, 43 | "gen_attribute_num_layers": 3, 44 | "gen_attribute_num_units": 100, 45 | "disc_num_layers": 5, 46 | "disc_num_units": 200, 47 | "initial_state": "random", 48 | "leaky_relu": false, 49 | "attr_d_lr": 0.001, 50 | "attr_d_gp_coe": 10.0, 51 | "g_attr_d_coe": 1.0, 52 | "attr_disc_num_layers": 5, 53 | "attr_disc_num_units": 200, 54 | "aux_disc": true, 55 | "self_norm": true, 56 | "fix_feature_network": false, 57 | "debug": false, 58 | "combined_disc": true, 59 | "use_gt_lengths": false, 60 | "use_uniform_lengths": false, 61 | "num_cores": null, 62 | "sn_mode": null, 63 | "scale": 1.0, 64 | "extra_checkpoint_freq": 20000, 65 | "epoch_checkpoint_freq": 1000, 66 | "dp_noise_multiplier": null, 67 | "dp_l2_norm_clip": null 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /netshare/configs/default/single_event_per_row.json: -------------------------------------------------------------------------------- 1 | { 2 | "global_config": { 3 | "overwrite": false, 4 | "original_data_file": "traces/1M/ugr16/raw.csv", 5 | "dataset_type": "netflow", 6 | "n_chunks": 10, 7 | "dp": false, 8 | "allowed_data_types": [ 9 | "ip_string", 10 | "integer", 11 | "float", 12 | "string" 13 | ], 14 | "allowed_data_encodings": [ 15 | "categorical", 16 | "bit", 17 | "word2vec_port", 18 | "word2vec_proto" 19 | ] 20 | }, 21 | "pre_post_processor": { 22 | "class": 
"NetsharePrePostProcessor", 23 | "config": { 24 | "max_flow_len": null, 25 | "norm_option": 0, 26 | "split_name": "multichunk_dep_v2", 27 | "df2chunks": "fixed_time", 28 | "truncate": "per_chunk" 29 | } 30 | }, 31 | "model_manager": { 32 | "class": "NetShareManager", 33 | "config": { 34 | "pretrain_dir": null, 35 | "skip_chunk0_train": false, 36 | "pretrain_non_dp": true, 37 | "pretrain_non_dp_reduce_time": 4.0, 38 | "pretrain_dp": false, 39 | "run": 0 40 | } 41 | }, 42 | "model": { 43 | "class": "DoppelGANgerTorchModel", 44 | "config": { 45 | "batch_size": 100, 46 | "sample_len": [ 47 | 1, 48 | 5, 49 | 10 50 | ], 51 | "sample_len_expand": true, 52 | "iteration": 200000, 53 | "vis_freq": 100000, 54 | "vis_num_sample": 5, 55 | "d_rounds": 5, 56 | "g_rounds": 1, 57 | "num_packing": 1, 58 | "noise": true, 59 | "attr_noise_type": "normal", 60 | "feature_noise_type": "normal", 61 | "rnn_mlp_num_layers": 0, 62 | "feed_back": false, 63 | "g_lr": 0.0001, 64 | "g_beta1": 0.5, 65 | "d_lr": 0.0001, 66 | "d_beta1": 0.5, 67 | "d_gp_coe": 10.0, 68 | "adam_eps": 1e-8, 69 | "adam_amsgrad": false, 70 | "generator_feature_num_layers": 1, 71 | "generator_feature_num_units": 100, 72 | "generator_attribute_num_layers": 5, 73 | "generator_attribute_num_units": 512, 74 | "discriminator_num_layers": 5, 75 | "discriminator_num_units": 512, 76 | "initial_state": "random", 77 | "leaky_relu": false, 78 | "attr_d_lr": 0.0001, 79 | "attr_d_beta1": 0.5, 80 | "attr_d_gp_coe": 10.0, 81 | "g_attr_d_coe": 1.0, 82 | "attr_discriminator_num_layers": 5, 83 | "attr_discriminator_num_units": 512, 84 | "use_attr_discriminator": true, 85 | "self_norm": false, 86 | "fix_feature_network": false, 87 | "debug": false, 88 | "combined_disc": true, 89 | "use_gt_lengths": false, 90 | "use_uniform_lengths": false, 91 | "num_cores": null, 92 | "sn_mode": null, 93 | "scale": 1.0, 94 | "extra_checkpoint_freq": 20000, 95 | "epoch_checkpoint_freq": 1000, 96 | "dp_noise_multiplier": null, 97 | "dp_l2_norm_clip": null, 98 | "use_adaptive_rolling": false, 99 | "attribute_latent_dim": 5, 100 | "feature_latent_dim": 5 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /netshare/generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/netshare/generators/__init__.py -------------------------------------------------------------------------------- /netshare/generators/generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import copy 4 | import warnings 5 | import pandas as pd 6 | 7 | import netshare.pre_post_processors as pre_post_processors 8 | import netshare.model_managers as model_managers 9 | import netshare.models as models 10 | from ..pre_post_processors.netshare.util import create_sdmetrics_config 11 | 12 | from config_io import Config 13 | from sdmetrics.reports.timeseries import QualityReport 14 | from ..configs import default as default_configs 15 | 16 | 17 | class Generator(object): 18 | def __init__(self, config): 19 | self._config = Config.load_from_file( 20 | config, 21 | default_search_paths=default_configs.__path__) 22 | config = copy.deepcopy(self._config) 23 | 24 | global_config = self._config["global_config"] 25 | 26 | if 'original_data_folder' in global_config and \ 27 | 'file_extension' not in global_config: 28 | raise ValueError('Input is a folder. 
' 29 | 'Intended file extensions must be specified with ' 30 | '`file_extension=<.ext>` (e.g., `.pcap`, `.csv`') 31 | 32 | if 'original_data_folder' in global_config and \ 33 | "original_data_file" in global_config: 34 | raise ValueError( 35 | 'Input can be either a single file or \ 36 | a folder (with multiple valid files)!') 37 | self._ori_data_path = global_config['original_data_folder'] \ 38 | if 'original_data_folder' in global_config \ 39 | else global_config['original_data_file'] 40 | self._overwrite = global_config['overwrite'] 41 | 42 | pre_post_processor_class = getattr( 43 | pre_post_processors, config['pre_post_processor']['class']) 44 | pre_post_processor_config = Config(global_config) 45 | pre_post_processor_config.update( 46 | config['pre_post_processor']['config']) 47 | self._pre_post_processor = pre_post_processor_class( 48 | config=pre_post_processor_config) 49 | 50 | model_manager_class = getattr( 51 | model_managers, config['model_manager']['class']) 52 | model_manager_config = Config(global_config) 53 | model_manager_config.update(config['model_manager']['config']) 54 | self._model_manager = model_manager_class(config=model_manager_config) 55 | 56 | model_class = getattr(models, config['model']['class']) 57 | model_config = config['model']['config'] 58 | self._model = model_class 59 | self._model_config = model_config 60 | 61 | def _get_pre_processed_data_folder(self, work_folder): 62 | return os.path.join(work_folder, 'pre_processed_data') 63 | 64 | def _get_post_processed_data_folder(self, work_folder): 65 | return os.path.join(work_folder, 'post_processed_data') 66 | 67 | def _get_generated_data_folder(self, work_folder): 68 | return os.path.join(work_folder, 'generated_data') 69 | 70 | def _get_model_folder(self, work_folder): 71 | return os.path.join(work_folder, 'models') 72 | 73 | def _get_visualization_folder(self, work_folder): 74 | return os.path.join(work_folder, "visulization") 75 | 76 | def _get_pre_processed_data_log_folder(self, work_folder): 77 | return os.path.join(work_folder, 'logs', 'pre_processed_data') 78 | 79 | def _get_post_processed_data_log_folder(self, work_folder): 80 | return os.path.join(work_folder, 'logs', 'post_processed_data') 81 | 82 | def _get_generated_data_log_folder(self, work_folder): 83 | return os.path.join(work_folder, 'logs', 'generated_data') 84 | 85 | def _get_model_log_folder(self, work_folder): 86 | return os.path.join(work_folder, 'logs', 'models') 87 | 88 | def _pre_process(self, input_folder, output_folder, log_folder): 89 | if not self._check_folder(output_folder): 90 | return False 91 | if not self._check_folder(log_folder): 92 | return False 93 | return self._pre_post_processor.pre_process( 94 | input_folder=input_folder, 95 | output_folder=output_folder, 96 | log_folder=log_folder) 97 | 98 | def _post_process(self, input_folder, output_folder, 99 | pre_processed_data_folder, log_folder): 100 | if not self._check_folder(output_folder): 101 | return False 102 | if not self._check_folder(log_folder): 103 | return False 104 | return self._pre_post_processor.post_process( 105 | input_folder=input_folder, 106 | output_folder=output_folder, 107 | pre_processed_data_folder=pre_processed_data_folder, 108 | log_folder=log_folder) 109 | 110 | def _train(self, input_train_data_folder, output_model_folder, log_folder): 111 | if not self._check_folder(output_model_folder): 112 | return False 113 | if not self._check_folder(log_folder): 114 | return False 115 | return self._model_manager.train( 116 | 
input_train_data_folder=input_train_data_folder, 117 | output_model_folder=output_model_folder, 118 | log_folder=log_folder, 119 | create_new_model=self._model, 120 | model_config=self._model_config) 121 | 122 | def _generate(self, input_train_data_folder, 123 | input_model_folder, output_syn_data_folder, log_folder): 124 | if not self._check_folder(output_syn_data_folder): 125 | return False 126 | if not self._check_folder(log_folder): 127 | return False 128 | return self._model_manager.generate( 129 | input_train_data_folder=input_train_data_folder, 130 | input_model_folder=input_model_folder, 131 | output_syn_data_folder=output_syn_data_folder, 132 | log_folder=log_folder, 133 | create_new_model=self._model, 134 | model_config=self._model_config) 135 | 136 | def _check_folder(self, folder): 137 | if os.path.exists(folder): 138 | if self._overwrite: 139 | warnings.warn( 140 | f'{folder} already exists. ' 141 | 'You are overwriting the results.') 142 | return True 143 | else: 144 | print( 145 | f'{folder} already exists. To avoid overwriting the ' 146 | 'results, please change the work_folder') 147 | return False 148 | return False 149 | os.makedirs(folder) 150 | return True 151 | 152 | def generate(self, work_folder): 153 | work_folder = os.path.expanduser(work_folder) 154 | if not self._generate( 155 | input_train_data_folder=self._get_pre_processed_data_folder( 156 | work_folder), 157 | input_model_folder=self._get_model_folder(work_folder), 158 | output_syn_data_folder=self._get_generated_data_folder( 159 | work_folder), 160 | log_folder=self._get_generated_data_log_folder(work_folder)): 161 | print('Failed to generate synthetic data') 162 | return False 163 | if not self._post_process( 164 | input_folder=self._get_generated_data_folder(work_folder), 165 | output_folder=self._get_post_processed_data_folder( 166 | work_folder), 167 | log_folder=self._get_post_processed_data_log_folder( 168 | work_folder), 169 | pre_processed_data_folder=self._get_pre_processed_data_folder( 170 | work_folder)): 171 | print('Failed to post-process data') 172 | return False 173 | print(f'Generated data is at ' 174 | f'{self._get_post_processed_data_folder(work_folder)}') 175 | return True 176 | 177 | def train(self, work_folder): 178 | work_folder = os.path.expanduser(work_folder) 179 | if not self._pre_process( 180 | input_folder=self._ori_data_path, 181 | output_folder=self._get_pre_processed_data_folder(work_folder), 182 | log_folder=self._get_pre_processed_data_log_folder( 183 | work_folder)): 184 | print('Failed to pre-process data') 185 | return False 186 | if not self._train( 187 | input_train_data_folder=self._get_pre_processed_data_folder( 188 | work_folder), 189 | output_model_folder=self._get_model_folder(work_folder), 190 | log_folder=self._get_model_log_folder(work_folder)): 191 | print('Failed to train the model') 192 | return False 193 | return True 194 | 195 | def train_and_generate(self, work_folder): 196 | work_folder = os.path.expanduser(work_folder) 197 | if not self.train(work_folder): 198 | return False 199 | if not self.generate(work_folder): 200 | return False 201 | return True 202 | 203 | def visualize(self, work_folder): 204 | work_folder = os.path.expanduser(work_folder) 205 | os.makedirs(self._get_visualization_folder(work_folder), exist_ok=True) 206 | real_data = pd.read_csv( 207 | os.path.join( 208 | self._get_pre_processed_data_folder(work_folder), "raw.csv")) 209 | # Find synthetic data with the largest ID 210 | syn_data_list = [ 211 | f 212 | for f in os.listdir( 213 | 
self._get_post_processed_data_folder(work_folder)) 214 | if f.endswith('.csv')] 215 | id_pattern = re.compile(r'id-(\d+).csv') 216 | ids = [int(id_pattern.search(filename).group(1)) 217 | for filename in syn_data_list if id_pattern.search(filename)] 218 | # Find the largest ID 219 | largest_id = max(ids) 220 | # Find the filename corresponding to the largest ID 221 | filename_with_largest_id = [ 222 | filename for filename in syn_data_list 223 | if f'id-{largest_id}.csv' in filename][0] 224 | print( 225 | f'The filename with the largest ID is: {filename_with_largest_id}') 226 | synthetic_data = pd.read_csv(os.path.join( 227 | self._get_post_processed_data_folder(work_folder), 228 | filename_with_largest_id 229 | )) 230 | 231 | # Visualize the real data and synthetic data 232 | pre_post_processor_config = Config(self._config["global_config"]) 233 | pre_post_processor_config.update( 234 | self._config['pre_post_processor']['config']) 235 | sdmetrics_config = create_sdmetrics_config( 236 | pre_post_processor_config, 237 | comparison_type='both' 238 | ) 239 | my_report = QualityReport( 240 | config_dict=sdmetrics_config['config']) 241 | my_report.generate(real_data[synthetic_data.columns], synthetic_data, 242 | sdmetrics_config['metadata']) 243 | my_report.visualize() 244 | -------------------------------------------------------------------------------- /netshare/model_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_manager import ModelManager 2 | from .netshare_manager.netshare_manager import NetShareManager 3 | from .dg_model_manager import DGModelManager 4 | 5 | __all__ = ['ModelManager', 'NetShareManager', 'DGModelManager'] 6 | -------------------------------------------------------------------------------- /netshare/model_managers/dg_model_manager.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import netshare.ray as ray 3 | 4 | from .model_manager import ModelManager 5 | 6 | 7 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 8 | def _train_model(create_new_model, config, input_train_data_folder, 9 | output_model_folder, log_folder): 10 | model = create_new_model(config) 11 | model.train( 12 | input_train_data_folder=input_train_data_folder, 13 | output_model_folder=output_model_folder, 14 | log_folder=log_folder) 15 | return True 16 | 17 | 18 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 19 | def _generate_data(create_new_model, config, input_train_data_folder, 20 | input_model_folder, output_syn_data_folder, log_folder): 21 | config["given_data_attribute_flag"] = False 22 | config["save_without_chunk"] = True 23 | model = create_new_model(config) 24 | model.generate( 25 | input_train_data_folder=input_train_data_folder, 26 | input_model_folder=input_model_folder, 27 | output_syn_data_folder=output_syn_data_folder, 28 | log_folder=log_folder) 29 | return True 30 | 31 | 32 | class DGModelManager(ModelManager): 33 | 34 | def _train(self, input_train_data_folder, output_model_folder, log_folder, 35 | create_new_model, model_config): 36 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 37 | ray.get(_train_model.remote( 38 | create_new_model=create_new_model, 39 | config=model_config, 40 | input_train_data_folder=input_train_data_folder, 41 | output_model_folder=output_model_folder, 42 | log_folder=log_folder)) 43 | return True 44 | 45 | def _generate(self, input_train_data_folder, input_model_folder, 46 | output_syn_data_folder, 
log_folder, create_new_model, 47 | model_config): 48 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 49 | ray.get(_generate_data.remote( 50 | create_new_model=create_new_model, 51 | config=model_config, 52 | input_train_data_folder=input_train_data_folder, 53 | input_model_folder=input_model_folder, 54 | output_syn_data_folder=output_syn_data_folder, 55 | log_folder=log_folder)) 56 | return True 57 | -------------------------------------------------------------------------------- /netshare/model_managers/model_manager.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | 4 | from netshare.utils import Tee 5 | 6 | 7 | class ModelManager(ABC): 8 | def __init__(self, config): 9 | self._config = config 10 | 11 | @abstractmethod 12 | def _train(self, input_train_data_folder, output_model_folder, 13 | log_folder, create_new_model, model_config): 14 | ... 15 | 16 | @abstractmethod 17 | def _generate(self, 18 | input_train_data_folder, input_model_folder, 19 | output_syn_data_folder, log_folder, 20 | create_new_model, model_config): 21 | ... 22 | 23 | def train(self, input_train_data_folder, output_model_folder, log_folder, 24 | create_new_model, model_config): 25 | stdout_log_path = os.path.join(log_folder, 'train.stdout.log') 26 | stderr_log_path = os.path.join(log_folder, 'train.stderr.log') 27 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 28 | return self._train( 29 | input_train_data_folder=input_train_data_folder, 30 | output_model_folder=output_model_folder, 31 | log_folder=log_folder, 32 | create_new_model=create_new_model, 33 | model_config=model_config) 34 | 35 | def generate(self, 36 | input_train_data_folder, input_model_folder, 37 | output_syn_data_folder, log_folder, 38 | create_new_model, model_config): 39 | stdout_log_path = os.path.join(log_folder, 'generate.stdout.log') 40 | stderr_log_path = os.path.join(log_folder, 'generate.stderr.log') 41 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 42 | return self._generate( 43 | input_train_data_folder=input_train_data_folder, 44 | input_model_folder=input_model_folder, 45 | output_syn_data_folder=output_syn_data_folder, 46 | log_folder=log_folder, 47 | create_new_model=create_new_model, 48 | model_config=model_config) 49 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/generate_helper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import subprocess 3 | import sys 4 | import time 5 | import os 6 | import json 7 | import importlib 8 | import random 9 | import pickle 10 | import pandas as pd 11 | import socket 12 | import struct 13 | import ipaddress 14 | import argparse 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | import netshare.ray as ray 20 | from pathlib import Path 21 | from tqdm import tqdm 22 | from scapy.all import IP, ICMP, TCP, UDP 23 | from scapy.all import wrpcap 24 | from scipy.stats import rankdata 25 | from pathlib import Path 26 | 27 | 28 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 29 | def _generate_session( 30 | create_new_model, 31 | configs, 32 | config_idx, 33 | log_folder): 34 | config = configs[config_idx] 35 | config["given_data_attribute_flag"] = False 36 | model = create_new_model(config) 37 | model.generate( 38 | input_train_data_folder=config["dataset"], 39 | 
input_model_folder=config["result_folder"], 40 | output_syn_data_folder=config["eval_root_folder"], 41 | log_folder=log_folder) 42 | 43 | 44 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 45 | def _generate_attr( 46 | create_new_model, 47 | configs, 48 | config_idx, 49 | log_folder): 50 | config = configs[config_idx] 51 | config["given_data_attribute_flag"] = False 52 | model = create_new_model(config) 53 | model.generate( 54 | input_train_data_folder=config["dataset"], 55 | input_model_folder=config["result_folder"], 56 | output_syn_data_folder=config["eval_root_folder"], 57 | log_folder=log_folder) 58 | 59 | 60 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 61 | def _merge_attr( 62 | attr_raw_npz_folder, 63 | config_group, 64 | configs 65 | ): 66 | num_chunks = len(config_group["config_ids"]) 67 | chunk0_idx = config_group["config_ids"][0] 68 | chunk0_config = configs[chunk0_idx] 69 | print("chunk0 config:", configs[chunk0_idx]) 70 | 71 | # Find flow tag starting point 72 | with open(os.path.join(chunk0_config["dataset"], "data_attribute_fields.pkl"), 'rb') as f: 73 | data_attribute_fields = pickle.load(f) 74 | bit_idx_flagstart = 0 75 | for field_idx, field in enumerate(data_attribute_fields): 76 | if field.name != "startFromThisChunk": 77 | bit_idx_flagstart += field.dim_x 78 | else: 79 | break 80 | print("bit_idx_flagstart:", bit_idx_flagstart) 81 | 82 | attr_clean_npz_folder = os.path.join( 83 | str(Path(attr_raw_npz_folder).parents[0]), "attr_clean" 84 | ) 85 | os.makedirs(attr_clean_npz_folder, exist_ok=True) 86 | 87 | dict_chunkid_attr = {} 88 | dict_chunkid_attr_discrete = {} 89 | for chunkid in tqdm(range(num_chunks)): 90 | dict_chunkid_attr[chunkid] = [] 91 | dict_chunkid_attr_discrete[chunkid] = [] 92 | 93 | for chunkid in tqdm(range(num_chunks)): 94 | n_flows_startFromThisEpoch = 0 95 | 96 | if not os.path.exists( 97 | os.path.join( 98 | attr_raw_npz_folder, 99 | "chunk_id-{}.npz".format(chunkid)) 100 | ): 101 | print( 102 | "{} not exists...".format( 103 | os.path.join( 104 | attr_raw_npz_folder, 105 | "chunk_id-{}.npz".format(chunkid)) 106 | ) 107 | ) 108 | continue 109 | 110 | raw_attr_chunk = np.load( 111 | os.path.join( 112 | attr_raw_npz_folder, 113 | "chunk_id-{}.npz".format(chunkid)) 114 | )["data_attribute"] 115 | raw_attr_discrete_chunk = np.load( 116 | os.path.join( 117 | attr_raw_npz_folder, 118 | "chunk_id-{}.npz".format(chunkid)) 119 | )["data_attribute_discrete"] 120 | 121 | if num_chunks > 1: 122 | for row_idx, row in enumerate(raw_attr_chunk): 123 | # if row[bit_idx_flagstart] < row[bit_idx_flagstart+1]: 124 | if ( 125 | row[bit_idx_flagstart] < row[bit_idx_flagstart + 1] 126 | and row[bit_idx_flagstart + 2 * chunkid + 2] 127 | < row[bit_idx_flagstart + 2 * chunkid + 3] 128 | ): 129 | # this chunk 130 | row_this_chunk = list( 131 | copy.deepcopy(row)[ 132 | :bit_idx_flagstart]) 133 | row_this_chunk += [0.0, 1.0] 134 | row_this_chunk += [1.0, 0.0] * (chunkid + 1) 135 | for i in range(chunkid + 1, num_chunks): 136 | if ( 137 | row[bit_idx_flagstart + 2 * i + 2] 138 | < row[bit_idx_flagstart + 2 * i + 3] 139 | ): 140 | row_this_chunk += [0.0, 1.0] 141 | else: 142 | row_this_chunk += [1.0, 0.0] 143 | # dict_chunkid_attr[chunkid].append(row_this_chunk) 144 | dict_chunkid_attr[chunkid].append(row) 145 | dict_chunkid_attr_discrete[chunkid].append( 146 | raw_attr_discrete_chunk[row_idx]) 147 | 148 | # following chunks 149 | # row_following_chunk = list(copy.deepcopy(row)[:bit_idx_flagstart]) 150 | # row_following_chunk += [1.0, 0.0]*(1+NUM_CHUNKS) 
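                    # Descriptive note: the block below copies this attribute
                    # row, flips its "startFromThisChunk" one-hot pair to
                    # "not starting here" ([1.0, 0.0]), and appends the copy to
                    # every later chunk whose per-chunk flag pair indicates the
                    # flow is still active, so those chunks can generate the
                    # continuation of the same flow.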
151 | n_flows_startFromThisEpoch += 1 152 | row_following_chunk = list(copy.deepcopy(row)) 153 | row_following_chunk[bit_idx_flagstart] = 1.0 154 | row_following_chunk[bit_idx_flagstart + 1] = 0.0 155 | 156 | row_discrete_following_chunk = list( 157 | copy.deepcopy(raw_attr_discrete_chunk[row_idx])) 158 | row_discrete_following_chunk[bit_idx_flagstart] = 1.0 159 | row_discrete_following_chunk[bit_idx_flagstart + 1] = 0.0 160 | 161 | for i in range(chunkid + 1, num_chunks): 162 | if ( 163 | row[bit_idx_flagstart + 2 * i + 2] 164 | < row[bit_idx_flagstart + 2 * i + 3] 165 | ): 166 | dict_chunkid_attr[i].append(row_following_chunk) 167 | dict_chunkid_attr_discrete[i].append( 168 | row_discrete_following_chunk) 169 | # dict_chunkid_attr[i].append(row) 170 | else: 171 | dict_chunkid_attr[chunkid] = raw_attr_chunk 172 | dict_chunkid_attr_discrete[chunkid] = raw_attr_discrete_chunk 173 | 174 | print( 175 | "n_flows_startFromThisEpoch / total flows: {}/{}".format( 176 | n_flows_startFromThisEpoch, raw_attr_chunk.shape[0] 177 | ) 178 | ) 179 | 180 | print("Saving merged attrs...") 181 | n_merged_attrs = 0 182 | for chunkid, attr_clean in dict_chunkid_attr.items(): 183 | print("chunk {}: {} flows".format(chunkid, len(attr_clean))) 184 | n_merged_attrs += len(attr_clean) 185 | np.savez( 186 | os.path.join( 187 | attr_clean_npz_folder, "chunk_id-{}.npz".format(chunkid)), 188 | data_attribute=np.asarray(attr_clean), 189 | data_attribute_discrete=np.asarray( 190 | dict_chunkid_attr_discrete[chunkid])) 191 | 192 | print("n_merged_attrs:", n_merged_attrs) 193 | 194 | 195 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 196 | # @ray.remote(scheduling_strategy="DEFAULT", max_calls=1) 197 | def _generate_given_attr(create_new_model, configs, config_idx, 198 | log_folder): 199 | 200 | config = configs[config_idx] 201 | config["given_data_attribute_flag"] = True 202 | model = create_new_model(config) 203 | model.generate( 204 | input_train_data_folder=config["dataset"], 205 | input_model_folder=config["result_folder"], 206 | output_syn_data_folder=config["eval_root_folder"], 207 | log_folder=log_folder) 208 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/netshare_manager.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from ..model_manager import ModelManager 4 | from .train_helper import _train_specific_config_group 5 | from .generate_helper import _generate_attr, _merge_attr, _generate_given_attr, _generate_session 6 | from .netshare_util import _load_config, _configs2configsgroup 7 | import netshare.ray as ray 8 | import os 9 | import time 10 | import json 11 | 12 | import pandas as pd 13 | 14 | 15 | class NetShareManager(ModelManager): 16 | def _train(self, input_train_data_folder, output_model_folder, log_folder, 17 | create_new_model, model_config): 18 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 19 | 20 | configs = _load_config( 21 | config_dict={ 22 | **self._config, 23 | **model_config}, 24 | input_train_data_folder=input_train_data_folder, 25 | output_model_folder=output_model_folder) 26 | 27 | configs, config_group_list = _configs2configsgroup( 28 | configs=configs, 29 | generation_flag=False) 30 | print(config_group_list) 31 | with open(os.path.join(output_model_folder, "configs_train.json"), 'w') as f: 32 | json.dump({ 33 | "configs": configs, 34 | "config_group_list": config_group_list 35 | }, f, indent=4) 36 | 37 | objs = [] 38 | for 
config_group_id, config_group in enumerate(config_group_list): 39 | objs.append( 40 | _train_specific_config_group.remote( 41 | create_new_model=create_new_model, 42 | config_group_id=config_group_id, 43 | config_group=config_group, 44 | configs=configs, 45 | input_train_data_folder=input_train_data_folder, 46 | output_model_folder=output_model_folder, 47 | log_folder=log_folder) 48 | ) 49 | results = ray.get(objs) 50 | return results 51 | 52 | def _generate( 53 | self, input_train_data_folder, input_model_folder, 54 | output_syn_data_folder, log_folder, create_new_model, model_config): 55 | configs = _load_config( 56 | config_dict={ 57 | **self._config, 58 | **model_config}, 59 | input_train_data_folder=input_train_data_folder, 60 | output_model_folder=input_model_folder) 61 | 62 | configs, config_group_list = _configs2configsgroup( 63 | configs=configs, 64 | generation_flag=True, 65 | output_syn_data_folder=output_syn_data_folder 66 | ) 67 | 68 | with open(os.path.join(output_syn_data_folder, "configs_generate.json"), 'w') as f: 69 | json.dump({ 70 | "configs": configs, 71 | "config_group_list": config_group_list 72 | }, f, indent=4) 73 | 74 | print("Start generating attributes ...") 75 | if configs[0]["n_chunks"] > 1: 76 | objs = [] 77 | for config_idx, config in enumerate(configs): 78 | objs.append( 79 | _generate_attr.remote( 80 | create_new_model=create_new_model, 81 | configs=configs, 82 | config_idx=config_idx, 83 | log_folder=log_folder)) 84 | _ = ray.get(objs) 85 | time.sleep(10) 86 | print("Finish generating attributes") 87 | 88 | print("Start merging attributes ...") 89 | objs = [] 90 | for config_group in config_group_list: 91 | chunk0_idx = config_group["config_ids"][0] 92 | eval_root_folder = configs[chunk0_idx]["eval_root_folder"] 93 | 94 | objs.append( 95 | _merge_attr.remote( 96 | attr_raw_npz_folder=os.path.join( 97 | eval_root_folder, "attr_raw"), 98 | config_group=config_group, 99 | configs=configs) 100 | ) 101 | _ = ray.get(objs) 102 | time.sleep(10) 103 | print("Finish merging attributes...") 104 | 105 | print("Start generating features given attributes ...") 106 | objs = [] 107 | for config_idx, config in enumerate(configs): 108 | objs.append( 109 | _generate_given_attr.remote( 110 | create_new_model=create_new_model, 111 | configs=configs, 112 | config_idx=config_idx, 113 | log_folder=log_folder)) 114 | _ = ray.get(objs) 115 | time.sleep(10) 116 | else: 117 | objs = [] 118 | for config_idx, config in enumerate(configs): 119 | objs.append( 120 | _generate_session.remote( 121 | create_new_model=create_new_model, 122 | configs=configs, 123 | config_idx=config_idx, 124 | log_folder=log_folder)) 125 | _ = ray.get(objs) 126 | print("Finish generating features given attributes ...") 127 | 128 | return True 129 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/netshare_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from config_io import Config 5 | 6 | 7 | def _load_config(config_dict, input_train_data_folder, output_model_folder): 8 | config_pre_expand = Config(config_dict) 9 | 10 | # TODO: add preprocessing logic for DoppelGANger (single-chunk?) 
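    # Descriptive note: the block below scans input_train_data_folder for
    # per-chunk subfolders named `chunkid-<i>`, registers each existing one in
    # config["dataset"], resets n_chunks to the number of chunks actually
    # found, and then calls expand() so that each (chunk, expanded option)
    # combination becomes its own configuration.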
11 | config_pre_expand["dataset"] = [] 12 | config_pre_expand["dataset_expand"] = True 13 | n_valid_chunks = 0 14 | for chunk_id in range(config_pre_expand["n_chunks"]): 15 | dataset_folder = os.path.join( 16 | input_train_data_folder, f"chunkid-{chunk_id}") 17 | if os.path.exists(dataset_folder) and os.path.isdir(dataset_folder): 18 | config_pre_expand["dataset"].append(dataset_folder) 19 | n_valid_chunks += 1 20 | config_pre_expand["n_chunks"] = n_valid_chunks 21 | print("Number of valid chunks:", config_pre_expand["n_chunks"]) 22 | 23 | config_post_expand = config_pre_expand.expand() 24 | print( 25 | f"Number of configurations after expanded: {len(config_post_expand)}") 26 | 27 | configs = [] 28 | for config_ in config_post_expand: 29 | sub_result_folder = os.path.join( 30 | os.path.basename(config_["dataset"]), 31 | ",".join("{}-{}".format(k, os.path.basename(str(v))) 32 | for k, v in config_.items() 33 | if f"{k}_expand" in config_.keys() and k != "dataset") 34 | ) 35 | config_["sub_result_folder"] = sub_result_folder 36 | config_["result_folder"] = os.path.join( 37 | output_model_folder, sub_result_folder) 38 | 39 | # sanity check 40 | if config_["pretrain_non_dp"] and \ 41 | ((config_["dp_noise_multiplier"] is not None) or 42 | (config_["dp_l2_norm_clip"] is not None)): 43 | raise ValueError( 44 | "pretrain_non_DP can only be used for non-DP case!") 45 | 46 | if config_["pretrain_non_dp"] and \ 47 | config_["pretrain_non_dp_reduce_time"] is None: 48 | raise ValueError( 49 | "pretrain_non_dp=True, " 50 | "then pretrain_non_dp_reduce_time must be set!") 51 | 52 | if not config_["pretrain_non_dp"] and \ 53 | config_["pretrain_non_dp_reduce_time"] is not None: 54 | raise ValueError( 55 | "pretrain_non_dp=False, " 56 | "pretrain_non_dp_reduce_time does not need to be set!") 57 | 58 | if config_["pretrain_non_dp"] and config_["pretrain_dp"]: 59 | raise ValueError( 60 | "Only one of pretrain_non_DP and pretrain_DP can be True!") 61 | 62 | if config_["pretrain_dp"] and config_["pretrain_dir"] is None: 63 | raise ValueError( 64 | "You are using DP with pretrained public model, " 65 | "pretrain_dir must be set to the pretrained public model " 66 | "checkpoint directory!") 67 | 68 | configs.append(config_) 69 | 70 | return configs 71 | 72 | 73 | def get_configid_from_kv(configs, k, v): 74 | for idx, config in enumerate(configs): 75 | if config[k] == v: 76 | return idx 77 | raise ValueError("{}: {} not found in configs!".format(k, v)) 78 | 79 | 80 | def _configs2configsgroup( 81 | configs, 82 | generation_flag=False, 83 | output_syn_data_folder=None): 84 | ''' 85 | # convert a list of configurations to a grouped dictionary 86 | # for training purpose 87 | # key : value 88 | # dp : bool 89 | # dp_noise_multiplier: float 90 | # pretrain: bool 91 | # config_ids: list 92 | ''' 93 | if generation_flag and output_syn_data_folder is None: 94 | raise ValueError("Generation phase: " 95 | "output_syn_data_folder must be specified") 96 | 97 | config_id_list_victim = [i for i in range(len(configs))] 98 | config_group_list = [] 99 | 100 | for config_id, config in enumerate(configs): 101 | if config_id in config_id_list_victim: 102 | config_group = {} 103 | config_group["dp_noise_multiplier"] = config["dp_noise_multiplier"] 104 | config_group["dp"] = (config["dp_noise_multiplier"] is not None) 105 | config_group["pretrain"] = ( 106 | config["pretrain_non_dp"] or config["pretrain_dp"]) 107 | config_group["config_ids"] = [] 108 | 109 | num_chunks = config["n_chunks"] 110 | for chunk_idx in range(num_chunks): 
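                # Locate the config belonging to this chunk by substituting the
                # chunk id inside result_folder, then remove it from the victim
                # list so it is not assigned to another group.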
111 | config_id_ = get_configid_from_kv( 112 | configs=configs, 113 | k="result_folder", 114 | v=re.sub( 115 | 'chunkid-[0-9]+', 116 | 'chunkid-{}'.format(chunk_idx), 117 | config["result_folder"])) 118 | config_group["config_ids"].append(config_id_) 119 | config_id_list_victim.remove(config_id_) 120 | 121 | config_group_list.append(config_group) 122 | 123 | # sanity check 124 | assert len(config_id_list_victim) == 0 125 | config_ids_check = [] 126 | for config_group in config_group_list: 127 | config_ids_check += config_group["config_ids"] 128 | assert set(config_ids_check) == set([i for i in range(len(configs))]) 129 | 130 | # add pretrain_dir etc. to the original configs 131 | for config_group in config_group_list: 132 | if not config_group["pretrain"]: 133 | for config_id in config_group["config_ids"]: 134 | configs[config_id]["restore"] = False 135 | else: 136 | if not config_group["dp"]: 137 | for chunk_id, config_id in enumerate( 138 | config_group["config_ids"]): 139 | if chunk_id == 0: 140 | chunk0_idx = config_id 141 | configs[config_id]["restore"] = False 142 | epoch_range = list( 143 | range( 144 | configs[config_id]["epoch_checkpoint_freq"]-1, 145 | configs[config_id]["epochs"], 146 | configs[config_id]["epoch_checkpoint_freq"])) 147 | epoch_range.reverse() 148 | 149 | pretrain_dir = None 150 | # use last available ckpt 151 | if configs[config_id]["skip_chunk0_train"]: 152 | last_epoch_found = False 153 | for epoch_id in epoch_range: 154 | if last_epoch_found: 155 | break 156 | ckpt_dir = os.path.join( 157 | configs[config_id]["result_folder"], 158 | "checkpoint", 159 | "epoch_id-{}.pt".format(epoch_id) 160 | ) 161 | if os.path.exists(ckpt_dir): 162 | last_epoch_found = True 163 | 164 | if not last_epoch_found: 165 | raise ValueError( 166 | "Skipping chunk0 training but " 167 | "chunk0 has no available ckpt at {}! " 168 | "Please move ckpts into the " 169 | "corresponding folder.".format( 170 | configs[config_id]["result_folder"])) 171 | else: 172 | pretrain_dir = ckpt_dir 173 | else: 174 | if os.path.exists(os.path.join( 175 | configs[config_id]["result_folder"], 176 | "checkpoint") 177 | ) and not generation_flag: 178 | raise ValueError( 179 | "Chunk0 training NOT skipped " 180 | "but ckpts already exist! 
" 181 | "Please change your working folder " 182 | "or clean up the ckpt folder " 183 | "to continute training from scratch.") 184 | 185 | pretrain_dir = os.path.join( 186 | configs[config_id]["result_folder"], 187 | "checkpoint", 188 | "epoch_id-{}.pt".format( 189 | epoch_range[0]) 190 | ) 191 | 192 | configs[config_id]["pretrain_dir"] = pretrain_dir 193 | 194 | else: 195 | configs[config_id]["restore"] = True 196 | configs[config_id]["pretrain_dir"] = pretrain_dir 197 | configs[config_id]["epochs"] = int( 198 | configs[config_id]["epochs"] / 199 | configs[config_id]["pretrain_non_dp_reduce_time"]) 200 | 201 | else: 202 | for chunk_id, config_id in enumerate( 203 | config_group["config_ids"]): 204 | configs[config_id]["restore"] = True 205 | 206 | # add chunk_id and eval_root_folder for generation related 207 | if generation_flag: 208 | for config_group in config_group_list: 209 | chunk0_idx = config_group["config_ids"][0] 210 | eval_root_folder = os.path.join( 211 | output_syn_data_folder, 212 | re.sub( 213 | 'chunkid-0', 214 | '', 215 | configs[chunk0_idx]["sub_result_folder"]).strip("/")) 216 | for chunk_id, config_id in enumerate(config_group["config_ids"]): 217 | configs[config_id]["chunk_id"] = chunk_id 218 | configs[config_id]["eval_root_folder"] = eval_root_folder 219 | 220 | for config in configs: 221 | os.makedirs(config["result_folder"], exist_ok=True) 222 | if generation_flag: 223 | os.makedirs(config["eval_root_folder"], exist_ok=True) 224 | 225 | return configs, config_group_list 226 | -------------------------------------------------------------------------------- /netshare/model_managers/netshare_manager/train_helper.py: -------------------------------------------------------------------------------- 1 | import netshare.ray as ray 2 | import os 3 | 4 | 5 | @ray.remote(scheduling_strategy="SPREAD", max_calls=1) 6 | def _launch_one_chunk_training( 7 | create_new_model, configs, config_idx, input_train_data_folder, 8 | output_model_folder, log_folder): 9 | model = create_new_model(configs[config_idx]) 10 | obj = model.train(input_train_data_folder, output_model_folder, log_folder) 11 | return obj 12 | 13 | 14 | def _launch_other_chunks_training( 15 | create_new_model, configs, config_ids, input_train_data_folder, 16 | output_model_folder, log_folder): 17 | chunk0_idx = config_ids[0] 18 | if configs[chunk0_idx]["skip_chunk0_train"] and configs[chunk0_idx][ 19 | "pretrain_dir"] is None: 20 | raise ValueError( 21 | "Skipping chunk0 training but chunk0 has no available ckpt!" 
22 | "Please move ckpts into the corresponding folder.") 23 | objs = [] 24 | for config_idx in config_ids[1:]: 25 | # sanity check 26 | if not os.path.exists(configs[config_idx]["pretrain_dir"]): 27 | raise ValueError( 28 | f"Pretrain_dir {configs[config_idx]['pretrain_dir']} does not exist!") 29 | objs.append( 30 | _launch_one_chunk_training.remote( 31 | create_new_model, 32 | configs, 33 | config_idx, 34 | input_train_data_folder, 35 | output_model_folder, 36 | log_folder)) 37 | 38 | results = ray.get(objs) 39 | return results 40 | 41 | 42 | def _launch_all_chunks_training( 43 | create_new_model, configs, config_ids, input_train_data_folder, 44 | output_model_folder, log_folder): 45 | objs = [] 46 | for config_idx in config_ids: 47 | # sanity check 48 | if not os.path.exists(configs[config_idx]["pretrain_dir"]): 49 | raise ValueError("Pretrain_dir {} does not exist!") 50 | objs.append( 51 | _launch_one_chunk_training.remote( 52 | create_new_model, 53 | configs, 54 | config_idx, 55 | input_train_data_folder, 56 | output_model_folder, 57 | log_folder)) 58 | 59 | results = ray.get(objs) 60 | return results 61 | 62 | 63 | @ray.remote(scheduling_strategy="SPREAD") 64 | def _train_specific_config_group( 65 | create_new_model, 66 | config_group_id, 67 | config_group, 68 | configs, 69 | input_train_data_folder, 70 | output_model_folder, 71 | log_folder): 72 | print( 73 | "Config group {}: DP: {}, pretrain: {}".format( 74 | config_group_id, config_group["dp"], config_group["pretrain"] 75 | ) 76 | ) 77 | config_ids = config_group["config_ids"] 78 | if config_group["dp"] == False and config_group["pretrain"] == True: 79 | chunk0_idx = config_ids[0] 80 | if configs[chunk0_idx]["skip_chunk0_train"] == True: 81 | print("Skipping chunk0 training...") 82 | else: 83 | print("Start launching chunk0 experiments...") 84 | # launch first chunk 85 | config_idx = config_ids[0] 86 | result = ray.get( 87 | _launch_one_chunk_training.remote( 88 | create_new_model, 89 | configs, 90 | config_idx, 91 | input_train_data_folder, 92 | output_model_folder, 93 | log_folder)) 94 | 95 | print("Finish launching chunk0 experiments ...") 96 | 97 | if len(configs) > 1: 98 | print( 99 | f"Start waiting for other chunks from config_group_id {config_group_id} experiments finished ...") 100 | results = _launch_other_chunks_training( 101 | create_new_model, 102 | configs, 103 | config_ids, 104 | input_train_data_folder, 105 | output_model_folder, 106 | log_folder) 107 | print(f"Other chunks from config_group_id {config_group_id} training finished") 108 | 109 | else: 110 | print("Launching all chunks experiments...") 111 | # Haven't been tested 112 | results = _launch_all_chunks_training( 113 | create_new_model, 114 | configs, 115 | config_ids, 116 | input_train_data_folder, 117 | output_model_folder, 118 | log_folder) 119 | 120 | return True 121 | -------------------------------------------------------------------------------- /netshare/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Model 2 | from .doppelganger_torch_model import DoppelGANgerTorchModel 3 | 4 | __all__ = ['Model', 'DoppelGANgerTorchModel'] 5 | -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/netshare/models/doppelganger_torch/__init__.py 
-------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/load_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import pickle 5 | 6 | 7 | def load_data(path, sample_len, flag="train"): 8 | 9 | data_npz = np.load(os.path.join(path, "data_{}.npz".format(flag))) 10 | with open(os.path.join(path, "data_feature_output.pkl"), "rb") as f: 11 | data_feature_outputs = pickle.load(f) 12 | with open(os.path.join(path, "data_attribute_output.pkl"), "rb") as f: 13 | data_attribute_outputs = pickle.load(f) 14 | 15 | data_feature = data_npz["data_feature"] 16 | data_attribute = data_npz["data_attribute"] 17 | data_gen_flag = data_npz["data_gen_flag"] 18 | 19 | # Append data_feature and data_gen_flag to multiple of sample_len 20 | timeseries_len = data_feature.shape[1] 21 | ceil_timeseries_len = math.ceil(timeseries_len / sample_len) * sample_len 22 | data_feature = np.pad( 23 | data_feature, 24 | pad_width=((0, 0), 25 | (0, ceil_timeseries_len - timeseries_len), 26 | (0, 0)), 27 | mode='constant', constant_values=0) 28 | data_gen_flag = np.pad( 29 | data_gen_flag, 30 | pad_width=((0, 0), 31 | (0, ceil_timeseries_len - timeseries_len)), 32 | mode='constant', constant_values=0) 33 | 34 | return ( 35 | data_feature, 36 | data_attribute, 37 | data_gen_flag, 38 | data_feature_outputs, 39 | data_attribute_outputs, 40 | ) 41 | -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/privacy_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Command-line script for computing privacy of a model trained with DP-SGD. 18 | The script applies the RDP accountant to estimate privacy budget of an iterated 19 | Sampled Gaussian Mechanism. 20 | The code is mainly based on Google's TF Privacy: 21 | https://github.com/tensorflow/privacy/blob/master/tensorflow_privacy/privacy/analysis/compute_dp_sgd_privacy.py 22 | Example: 23 | To call this script from command line, you can enter: 24 | $ python -m opacus.scripts.compute_dp_sgd_privacy --epochs=3 --delta=1e-5 --sample-rate 0.01 --noise-multiplier 1.0 --alphas 2 5 10 20 100 25 | DP-SGD with 26 | - sampling rate = 1%, 27 | - noise_multiplier = 1.0, 28 | - iterated over 300 steps 29 | satisfies differential privacy with 30 | - epsilon = 2.39, 31 | - delta = 1e-05. 32 | The optimal alpha is 5.0. 
33 | """ 34 | import argparse 35 | import math 36 | from typing import List, Tuple 37 | 38 | from opacus.accountants.analysis.rdp import compute_rdp, get_privacy_spent 39 | 40 | 41 | def _apply_dp_sgd_analysis( 42 | *, 43 | sample_rate: float, 44 | noise_multiplier: float, 45 | steps: int, 46 | alphas: List[float], 47 | delta: float, 48 | verbose: bool = True, 49 | ) -> Tuple[float, float]: 50 | """ 51 | Computes the privacy Epsilon at a given delta via RDP accounting and 52 | converting to an (epsilon, delta) guarantee for a target Delta. 53 | Args: 54 | sample_rate : The sample rate in SGD 55 | noise_multiplier : The ratio of the standard deviation of the Gaussian 56 | noise to the L2-sensitivity of the function to which the noise is added 57 | steps : The number of steps 58 | alphas : A list of RDP orders 59 | delta : Target delta 60 | verbose : If enabled, will print the results of DP-SGD analysis 61 | Returns: 62 | Pair of privacy loss epsilon and optimal order alpha 63 | """ 64 | rdp = compute_rdp( 65 | q=sample_rate, noise_multiplier=noise_multiplier, steps=steps, orders=alphas 66 | ) 67 | eps, opt_alpha = get_privacy_spent(orders=alphas, rdp=rdp, delta=delta) 68 | 69 | if verbose: 70 | print( 71 | f"DP-SGD with\n\tsampling rate = {100 * sample_rate:.3g}%," 72 | f"\n\tnoise_multiplier = {noise_multiplier}," 73 | f"\n\titerated over {steps} steps,\nsatisfies " 74 | f"differential privacy with\n\tepsilon = {eps:.3g}," 75 | f"\n\tdelta = {delta}." 76 | f"\nThe optimal alpha is {opt_alpha}." 77 | ) 78 | 79 | if opt_alpha == max(alphas) or opt_alpha == min(alphas): 80 | print( 81 | "The privacy estimate is likely to be improved by expanding " 82 | "the set of alpha orders." 83 | ) 84 | return eps, opt_alpha 85 | 86 | 87 | def compute_dp_sgd_privacy( 88 | *, 89 | sample_rate: float, 90 | noise_multiplier: float, 91 | epochs: int, 92 | delta: float, 93 | alphas: List[float], 94 | verbose: bool = True, 95 | ) -> Tuple[float, float]: 96 | """ 97 | Performs the DP-SGD privacy analysis. 98 | Finds sample rate and number of steps based on the input parameters, and calls 99 | DP-SGD privacy analysis to find the privacy loss epsilon and optimal order alpha. 
100 | Args: 101 | sample_rate : probability of each sample from the dataset to be selected for a next batch 102 | noise_multiplier : The ratio of the standard deviation of the Gaussian noise 103 | to the L2-sensitivity of the function to which the noise is added 104 | epochs : Number of epochs 105 | delta : Target delta 106 | alphas : A list of RDP orders 107 | verbose : If enabled, will print the results of DP-SGD analysis 108 | Returns: 109 | Pair of privacy loss epsilon and optimal order alpha 110 | Raises: 111 | ValueError 112 | When batch size is greater than sample size 113 | """ 114 | if sample_rate > 1: 115 | raise ValueError("sample_rate must be no greater than 1") 116 | steps = epochs * math.ceil(1 / sample_rate) 117 | 118 | return _apply_dp_sgd_analysis( 119 | sample_rate=sample_rate, 120 | noise_multiplier=noise_multiplier, 121 | steps=steps, 122 | alphas=alphas, 123 | delta=delta, 124 | verbose=verbose, 125 | ) 126 | 127 | 128 | def main(): 129 | parser = argparse.ArgumentParser( 130 | description="Estimate privacy of a model trained with DP-SGD using RDP accountant", 131 | ) 132 | parser.add_argument( 133 | "-r", 134 | "--sample-rate", 135 | type=float, 136 | required=True, 137 | help="Input sample rate (probability of each sample from the dataset to be selected for a next batch)", 138 | ) 139 | parser.add_argument( 140 | "-n", 141 | "--noise-multiplier", 142 | type=float, 143 | required=True, 144 | help="Noise multiplier", 145 | ) 146 | parser.add_argument( 147 | "-e", 148 | "--epochs", 149 | type=int, 150 | required=True, 151 | help="Number of epochs to train", 152 | ) 153 | parser.add_argument( 154 | "-d", "--delta", type=float, default=1e-5, help="Targeted delta (default: 1e-5)" 155 | ) 156 | parser.add_argument( 157 | "-a", 158 | "--alphas", 159 | action="store", 160 | dest="alphas", 161 | type=float, 162 | nargs="+", 163 | default=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)), 164 | help="List of alpha values (alpha orders of Renyi-DP evaluation). " 165 | "A default list is provided. Else, space separated numbers. 
E.g.," 166 | "-a 10 100", 167 | ) 168 | 169 | args = parser.parse_args() 170 | 171 | compute_dp_sgd_privacy( 172 | sample_rate=args.sample_rate, 173 | noise_multiplier=args.noise_multiplier, 174 | epochs=args.epochs, 175 | delta=args.delta, 176 | alphas=args.alphas, 177 | ) 178 | 179 | 180 | if __name__ == "__main__": 181 | main() 182 | -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from netshare.utils import OutputType, Output, Normalization 3 | import numpy as np 4 | import matplotlib 5 | 6 | matplotlib.use("Agg") 7 | 8 | 9 | def renormalize_per_sample( 10 | data_feature, 11 | data_attribute, 12 | data_feature_outputs, 13 | data_attribute_outputs, 14 | gen_flags, 15 | num_real_attribute, 16 | ): 17 | attr_dim = 0 18 | for i in range(num_real_attribute): 19 | attr_dim += data_attribute_outputs[i].dim 20 | attr_dim_cp = attr_dim 21 | 22 | fea_dim = 0 23 | for output in data_feature_outputs: 24 | if output.type_ == OutputType.CONTINUOUS: 25 | for _ in range(output.dim): 26 | max_plus_min_d_2 = data_attribute[:, attr_dim] 27 | max_minus_min_d_2 = data_attribute[:, attr_dim + 1] 28 | attr_dim += 2 29 | 30 | max_ = max_plus_min_d_2 + max_minus_min_d_2 31 | min_ = max_plus_min_d_2 - max_minus_min_d_2 32 | 33 | max_ = np.expand_dims(max_, axis=1) 34 | min_ = np.expand_dims(min_, axis=1) 35 | 36 | if output.normalization == Normalization.MINUSONE_ONE: 37 | data_feature[:, :, fea_dim] = ( 38 | data_feature[:, :, fea_dim] + 1.0 39 | ) / 2.0 40 | 41 | data_feature[:, :, fea_dim] = ( 42 | data_feature[:, :, fea_dim] * (max_ - min_) + min_ 43 | ) 44 | 45 | fea_dim += 1 46 | else: 47 | fea_dim += output.dim 48 | 49 | tmp_gen_flags = np.expand_dims(gen_flags, axis=2) 50 | data_feature = data_feature * tmp_gen_flags 51 | 52 | data_attribute = data_attribute[:, 0:attr_dim_cp] 53 | 54 | return data_feature, data_attribute 55 | 56 | 57 | def normalize_per_sample( 58 | data_feature, data_attribute, data_feature_outputs, 59 | data_attribute_outputs, eps=1e-4): 60 | # assume all samples have maximum length 61 | data_feature_min = np.amin(data_feature, axis=1) 62 | data_feature_max = np.amax(data_feature, axis=1) 63 | 64 | additional_attribute = [] 65 | additional_attribute_outputs = [] 66 | 67 | dim = 0 68 | for output in data_feature_outputs: 69 | if output.type_ == OutputType.CONTINUOUS: 70 | for _ in range(output.dim): 71 | max_ = data_feature_max[:, dim] + eps 72 | min_ = data_feature_min[:, dim] - eps 73 | 74 | additional_attribute.append((max_ + min_) / 2.0) 75 | additional_attribute.append((max_ - min_) / 2.0) 76 | additional_attribute_outputs.append( 77 | Output( 78 | type_=OutputType.CONTINUOUS, 79 | dim=1, 80 | normalization=output.normalization, 81 | is_gen_flag=False, 82 | ) 83 | ) 84 | additional_attribute_outputs.append( 85 | Output( 86 | type_=OutputType.CONTINUOUS, 87 | dim=1, 88 | normalization=Normalization.ZERO_ONE, 89 | is_gen_flag=False, 90 | ) 91 | ) 92 | 93 | max_ = np.expand_dims(max_, axis=1) 94 | min_ = np.expand_dims(min_, axis=1) 95 | 96 | data_feature[:, :, dim] = (data_feature[:, :, dim] - min_) / ( 97 | max_ - min_ 98 | ) 99 | if output.normalization == Normalization.MINUSONE_ONE: 100 | data_feature[:, :, dim] = data_feature[:, 101 | :, dim] * 2.0 - 1.0 102 | 103 | dim += 1 104 | else: 105 | dim += output.dim 106 | 107 | real_attribute_mask = [True] * len(data_attribute_outputs) + 
[False] * len( 108 | additional_attribute_outputs 109 | ) 110 | 111 | additional_attribute = np.stack(additional_attribute, axis=1) 112 | data_attribute = np.concatenate( 113 | [data_attribute, additional_attribute], axis=1) 114 | data_attribute_outputs.extend(additional_attribute_outputs) 115 | 116 | return data_feature, data_attribute, data_attribute_outputs, real_attribute_mask 117 | 118 | 119 | def add_gen_flag(data_feature, data_gen_flag, data_feature_outputs, 120 | sample_len): 121 | for output in data_feature_outputs: 122 | if output.is_gen_flag: 123 | raise Exception("is_gen_flag should be False for all" 124 | "feature_outputs") 125 | 126 | if (data_feature.shape[2] != 127 | np.sum([t.dim for t in data_feature_outputs])): 128 | raise Exception("feature dimension does not match feature_outputs") 129 | 130 | if len(data_gen_flag.shape) != 2: 131 | raise Exception("data_gen_flag should be 2 dimension") 132 | 133 | num_sample, length = data_gen_flag.shape 134 | 135 | data_gen_flag = np.expand_dims(data_gen_flag, 2) 136 | 137 | data_feature_outputs.append(Output( 138 | type_=OutputType.DISCRETE, 139 | dim=2, 140 | is_gen_flag=True)) 141 | 142 | shift_gen_flag = np.concatenate( 143 | [data_gen_flag[:, 1:, :], 144 | np.zeros((data_gen_flag.shape[0], 1, 1))], 145 | axis=1) 146 | if length % sample_len != 0: 147 | raise Exception("length must be a multiple of sample_len") 148 | data_gen_flag_t = np.reshape( 149 | data_gen_flag, 150 | [num_sample, int(length / sample_len), sample_len]) 151 | data_gen_flag_t = np.sum(data_gen_flag_t, 2) 152 | data_gen_flag_t = data_gen_flag_t > 0.5 153 | data_gen_flag_t = np.repeat(data_gen_flag_t, sample_len, axis=1) 154 | data_gen_flag_t = np.expand_dims(data_gen_flag_t, 2) 155 | data_feature = np.concatenate( 156 | [data_feature, 157 | shift_gen_flag, 158 | (1 - shift_gen_flag) * data_gen_flag_t], 159 | axis=2) 160 | 161 | return data_feature, data_feature_outputs 162 | 163 | def reverse_gen_flag(gen_flags): 164 | gen_flags = np.concatenate((np.ones((gen_flags.shape[0], 1)), gen_flags[:, :-1]), axis=1) 165 | return gen_flags -------------------------------------------------------------------------------- /netshare/models/doppelganger_torch_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import inspect 5 | import numpy as np 6 | 7 | from .model import Model 8 | from netshare.utils import output 9 | # from gan import output # NOQA 10 | # sys.modules["output"] = output # NOQA 11 | from .doppelganger_torch.doppelganger import DoppelGANger # NOQA 12 | from .doppelganger_torch.util import add_gen_flag, normalize_per_sample, renormalize_per_sample, reverse_gen_flag # NOQA 13 | from .doppelganger_torch.load_data import load_data # NOQA 14 | 15 | 16 | class DoppelGANgerTorchModel(Model): 17 | def _train(self, input_train_data_folder, output_model_folder, log_folder): 18 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 19 | 20 | self._config["result_folder"] = getattr( 21 | self._config, "result_folder", output_model_folder) 22 | self._config["dataset"] = getattr( 23 | self._config, "dataset", input_train_data_folder) 24 | 25 | print("Currently training with config:", self._config) 26 | # save config to the result folder 27 | with open(os.path.join( 28 | self._config["result_folder"], 29 | "config.json"), 'w') as fout: 30 | json.dump(self._config, fout) 31 | 32 | # load data 33 | ( 34 | data_feature, 35 | data_attribute, 36 | data_gen_flag, 37 | 
data_feature_outputs, 38 | data_attribute_outputs, 39 | ) = load_data( 40 | path=self._config["dataset"], 41 | sample_len=self._config["sample_len"]) 42 | num_real_attribute = len(data_attribute_outputs) 43 | 44 | # self-norm if applicable 45 | if self._config["self_norm"]: 46 | ( 47 | data_feature, 48 | data_attribute, 49 | data_attribute_outputs, 50 | real_attribute_mask 51 | ) = normalize_per_sample( 52 | data_feature, 53 | data_attribute, 54 | data_feature_outputs, 55 | data_attribute_outputs) 56 | else: 57 | real_attribute_mask = [True] * num_real_attribute 58 | 59 | data_feature, data_feature_outputs = add_gen_flag( 60 | data_feature, data_gen_flag, data_feature_outputs, self._config["sample_len"] 61 | ) 62 | 63 | # create directories 64 | checkpoint_dir = os.path.join( 65 | self._config["result_folder"], 66 | "checkpoint") 67 | if not os.path.exists(checkpoint_dir): 68 | os.makedirs(checkpoint_dir) 69 | sample_dir = os.path.join(self._config["result_folder"], "sample") 70 | if not os.path.exists(sample_dir): 71 | os.makedirs(sample_dir) 72 | time_path = os.path.join(self._config["result_folder"], "time.txt") 73 | 74 | dg = DoppelGANger( 75 | checkpoint_dir=checkpoint_dir, 76 | sample_dir=None, 77 | time_path=time_path, 78 | batch_size=self._config["batch_size"], 79 | real_attribute_mask=real_attribute_mask, 80 | max_sequence_len=data_feature.shape[1], 81 | sample_len=self._config["sample_len"], 82 | data_feature_outputs=data_feature_outputs, 83 | data_attribute_outputs=data_attribute_outputs, 84 | vis_freq=self._config["vis_freq"], 85 | vis_num_sample=self._config["vis_num_sample"], 86 | d_rounds=self._config["d_rounds"], 87 | g_rounds=self._config["g_rounds"], 88 | d_gp_coe=self._config["d_gp_coe"], 89 | num_packing=self._config["num_packing"], 90 | use_attr_discriminator=self._config["use_attr_discriminator"], 91 | attr_d_gp_coe=self._config["attr_d_gp_coe"], 92 | g_attr_d_coe=self._config["g_attr_d_coe"], 93 | epoch_checkpoint_freq=self._config["epoch_checkpoint_freq"], 94 | attribute_latent_dim=self._config["attribute_latent_dim"], 95 | feature_latent_dim=self._config["feature_latent_dim"], 96 | g_lr=self._config["g_lr"], 97 | g_beta1=self._config["g_beta1"], 98 | d_lr=self._config["d_lr"], 99 | d_beta1=self._config["d_beta1"], 100 | attr_d_lr=self._config["attr_d_lr"], 101 | attr_d_beta1=self._config["attr_d_beta1"], 102 | adam_eps=self._config["adam_eps"], 103 | adam_amsgrad=self._config["adam_amsgrad"], 104 | generator_attribute_num_units=self._config["generator_attribute_num_units"], 105 | generator_attribute_num_layers=self._config["generator_attribute_num_layers"], 106 | generator_feature_num_units=self._config["generator_feature_num_units"], 107 | generator_feature_num_layers=self._config["generator_feature_num_layers"], 108 | use_adaptive_rolling=self._config["use_adaptive_rolling"], 109 | discriminator_num_layers=self._config["discriminator_num_layers"], 110 | discriminator_num_units=self._config["discriminator_num_units"], 111 | attr_discriminator_num_layers=self._config["attr_discriminator_num_layers"], 112 | attr_discriminator_num_units=self._config["attr_discriminator_num_units"], 113 | restore=getattr(self._config, "restore", False), 114 | pretrain_dir=self._config["pretrain_dir"] 115 | ) 116 | 117 | dg.train( 118 | epochs=self._config["epochs"], 119 | data_feature=data_feature, 120 | data_attribute=data_attribute, 121 | data_gen_flag=data_gen_flag, 122 | ) 123 | 124 | def _generate(self, input_train_data_folder, 125 | input_model_folder, output_syn_data_folder, 
log_folder): 126 | print(f"{self.__class__.__name__}.{inspect.stack()[0][3]}") 127 | 128 | self._config["result_folder"] = getattr( 129 | self._config, "result_folder", input_model_folder) 130 | self._config["dataset"] = getattr( 131 | self._config, "dataset", input_train_data_folder) 132 | 133 | print("Currently generating with config:", self._config) 134 | 135 | # load data 136 | ( 137 | data_feature, 138 | data_attribute, 139 | data_gen_flag, 140 | data_feature_outputs, 141 | data_attribute_outputs, 142 | ) = load_data( 143 | path=self._config["dataset"], 144 | sample_len=self._config["sample_len"]) 145 | num_real_attribute = len(data_attribute_outputs) 146 | 147 | # self-norm if applicable 148 | if self._config["self_norm"]: 149 | ( 150 | data_feature, 151 | data_attribute, 152 | data_attribute_outputs, 153 | real_attribute_mask 154 | ) = normalize_per_sample( 155 | data_feature, 156 | data_attribute, 157 | data_feature_outputs, 158 | data_attribute_outputs) 159 | else: 160 | real_attribute_mask = [True] * num_real_attribute 161 | 162 | data_feature, data_feature_outputs = add_gen_flag( 163 | data_feature, data_gen_flag, data_feature_outputs, self._config["sample_len"] 164 | ) 165 | 166 | # create directories 167 | checkpoint_dir = os.path.join( 168 | self._config["result_folder"], 169 | "checkpoint") 170 | if not os.path.exists(checkpoint_dir): 171 | os.makedirs(checkpoint_dir) 172 | sample_dir = os.path.join(self._config["result_folder"], "sample") 173 | if not os.path.exists(sample_dir): 174 | os.makedirs(sample_dir) 175 | time_path = os.path.join(self._config["result_folder"], "time.txt") 176 | 177 | dg = DoppelGANger( 178 | checkpoint_dir=checkpoint_dir, 179 | sample_dir=None, 180 | time_path=time_path, 181 | batch_size=self._config["batch_size"], 182 | real_attribute_mask=real_attribute_mask, 183 | max_sequence_len=data_feature.shape[1], 184 | sample_len=self._config["sample_len"], 185 | data_feature_outputs=data_feature_outputs, 186 | data_attribute_outputs=data_attribute_outputs, 187 | vis_freq=self._config["vis_freq"], 188 | vis_num_sample=self._config["vis_num_sample"], 189 | d_rounds=self._config["d_rounds"], 190 | g_rounds=self._config["g_rounds"], 191 | d_gp_coe=self._config["d_gp_coe"], 192 | num_packing=self._config["num_packing"], 193 | use_attr_discriminator=self._config["use_attr_discriminator"], 194 | attr_d_gp_coe=self._config["attr_d_gp_coe"], 195 | g_attr_d_coe=self._config["g_attr_d_coe"], 196 | epoch_checkpoint_freq=self._config["epoch_checkpoint_freq"], 197 | attribute_latent_dim=self._config["attribute_latent_dim"], 198 | feature_latent_dim=self._config["feature_latent_dim"], 199 | g_lr=self._config["g_lr"], 200 | g_beta1=self._config["g_beta1"], 201 | d_lr=self._config["d_lr"], 202 | d_beta1=self._config["d_beta1"], 203 | attr_d_lr=self._config["attr_d_lr"], 204 | attr_d_beta1=self._config["attr_d_beta1"], 205 | adam_eps=self._config["adam_eps"], 206 | adam_amsgrad=self._config["adam_amsgrad"], 207 | generator_attribute_num_units=self._config["generator_attribute_num_units"], 208 | generator_attribute_num_layers=self._config["generator_attribute_num_layers"], 209 | generator_feature_num_units=self._config["generator_feature_num_units"], 210 | generator_feature_num_layers=self._config["generator_feature_num_layers"], 211 | use_adaptive_rolling=self._config["use_adaptive_rolling"], 212 | discriminator_num_layers=self._config["discriminator_num_layers"], 213 | discriminator_num_units=self._config["discriminator_num_units"], 214 | 
attr_discriminator_num_layers=self._config["attr_discriminator_num_layers"], 215 | attr_discriminator_num_units=self._config["attr_discriminator_num_units"], 216 | restore=getattr(self._config, "restore", False), 217 | pretrain_dir=self._config["pretrain_dir"] 218 | ) 219 | 220 | if self._config["given_data_attribute_flag"]: 221 | print("Generating from a given data attribute!") 222 | given_attr_npz_file = os.path.join( 223 | output_syn_data_folder, 224 | "attr_clean", 225 | "chunk_id-{}.npz".format(self._config["chunk_id"])) 226 | 227 | if not os.path.exists(given_attr_npz_file): 228 | raise ValueError( 229 | f"Given data attribute file {given_attr_npz_file}") 230 | given_data_attribute = np.load(given_attr_npz_file)[ 231 | "data_attribute"] 232 | given_data_attribute_discrete = np.load(given_attr_npz_file)[ 233 | "data_attribute_discrete"] 234 | # print("given_data_attribute:", given_data_attribute.shape) 235 | # print("given_data_attribute_discrete:", 236 | # given_data_attribute_discrete) 237 | else: 238 | print("Generating w/o given data attribute!") 239 | given_data_attribute = None 240 | given_data_attribute_discrete = None 241 | 242 | last_iteration_found = False 243 | epoch_range = list( 244 | range( 245 | self._config["epoch_checkpoint_freq"] - 1, 246 | self._config["epochs"], 247 | self._config["epoch_checkpoint_freq"], 248 | ) 249 | ) 250 | # reverse list in place 251 | epoch_range.reverse() 252 | generatedSamples_per_epoch = 1 253 | 254 | for epoch_id in epoch_range: 255 | if last_iteration_found and \ 256 | not self._config["given_data_attribute_flag"] and getattr(self._config, "n_chunks") > 1: 257 | break 258 | 259 | print("Processing epoch_id: {}".format(epoch_id)) 260 | mid_checkpoint_dir = os.path.join( 261 | checkpoint_dir, "epoch_id-{}.pt".format(epoch_id) 262 | ) 263 | if not os.path.exists(mid_checkpoint_dir): 264 | print("Not found {}".format(mid_checkpoint_dir)) 265 | continue 266 | else: 267 | last_iteration_found = True 268 | for generated_samples_idx in range(generatedSamples_per_epoch): 269 | print( 270 | "generate {}-th sample from epoch_id-{}".format( 271 | generated_samples_idx + 1, epoch_id 272 | ) 273 | ) 274 | 275 | num_samples = (data_attribute.shape[0] if ((given_data_attribute is None) and ( 276 | given_data_attribute_discrete is None)) else given_data_attribute.shape[0]) 277 | 278 | dg.load(mid_checkpoint_dir) 279 | print("Finished loading") 280 | 281 | ( 282 | features, 283 | attributes, 284 | attributes_discrete, 285 | gen_flags, 286 | lengths 287 | ) = dg.generate( 288 | num_samples=num_samples, 289 | given_attribute=given_data_attribute, 290 | given_attribute_discrete=given_data_attribute_discrete) 291 | 292 | gen_flags = reverse_gen_flag(gen_flags) 293 | 294 | if self._config["self_norm"]: 295 | features, attributes = renormalize_per_sample( 296 | features, 297 | attributes, 298 | data_feature_outputs, 299 | data_attribute_outputs, 300 | gen_flags, 301 | num_real_attribute=num_real_attribute, 302 | ) 303 | 304 | print(features.shape) 305 | print(attributes.shape) 306 | 307 | if getattr(self._config, "save_without_chunk", False) or getattr(self._config, "n_chunks") == 1: 308 | save_path = os.path.join( 309 | output_syn_data_folder, 310 | "feat_raw", 311 | "chunk_id-0") 312 | os.makedirs(save_path, exist_ok=True) 313 | np.savez( 314 | os.path.join( 315 | save_path, 316 | f"epoch_id-{epoch_id}.npz"), 317 | data_attribute=attributes, 318 | data_feature=features, 319 | data_gen_flag=gen_flags) 320 | elif not self._config["given_data_attribute_flag"]: 
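                    # Attribute-only pass (multi-chunk generation, phase 1):
                    # dump raw per-chunk attributes to attr_raw/; _merge_attr
                    # later merges them into attr_clean/, which the second
                    # pass (given_data_attribute_flag=True) reads to generate
                    # features conditioned on those attributes.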
321 | save_path = os.path.join( 322 | output_syn_data_folder, "attr_raw") 323 | os.makedirs(save_path, exist_ok=True) 324 | np.savez( 325 | os.path.join( 326 | save_path, 327 | "chunk_id-{}.npz".format( 328 | self._config["chunk_id"]) 329 | ), 330 | data_attribute=attributes, 331 | data_attribute_discrete=attributes_discrete 332 | ) 333 | print(os.path.join( 334 | save_path, 335 | "chunk_id-{}.npz".format( 336 | self._config["chunk_id"]) 337 | )) 338 | else: 339 | save_path = os.path.join( 340 | output_syn_data_folder, 341 | "feat_raw", 342 | f"chunk_id-{self._config['chunk_id']}") 343 | os.makedirs(save_path, exist_ok=True) 344 | np.savez( 345 | os.path.join( 346 | save_path, 347 | f"epoch_id-{epoch_id}.npz" 348 | ), 349 | data_attribute=attributes, 350 | data_feature=features, 351 | data_gen_flag=gen_flags, 352 | config=self._config 353 | ) 354 | -------------------------------------------------------------------------------- /netshare/models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | 4 | from netshare.utils import Tee 5 | 6 | 7 | class Model(ABC): 8 | def __init__(self, config): 9 | self._config = config 10 | 11 | @abstractmethod 12 | def _train(self, input_train_data_folder, output_model_folder, log_folder): 13 | ... 14 | 15 | @abstractmethod 16 | def _generate(self, input_train_data_folder, 17 | input_model_folder, output_syn_data_folder, log_folder): 18 | ... 19 | 20 | def train(self, input_train_data_folder, output_model_folder, log_folder): 21 | stdout_log_path = os.path.join(log_folder, 'model.train.stdout.log') 22 | stderr_log_path = os.path.join(log_folder, 'model.train.stderr.log') 23 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 24 | return self._train( 25 | input_train_data_folder=input_train_data_folder, 26 | output_model_folder=output_model_folder, 27 | log_folder=log_folder) 28 | 29 | def generate(self, input_train_data_folder, input_model_folder, 30 | output_syn_data_folder, log_folder): 31 | stdout_log_path = os.path.join(log_folder, 'model.generate.stdout.log') 32 | stderr_log_path = os.path.join(log_folder, 'model.generate.stderr.log') 33 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 34 | return self._generate( 35 | input_train_data_folder=input_train_data_folder, 36 | input_model_folder=input_model_folder, 37 | output_syn_data_folder=output_syn_data_folder, 38 | log_folder=log_folder) 39 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .pre_post_processor import PrePostProcessor 2 | from .netshare.netshare_pre_post_processor import NetsharePrePostProcessor 3 | from .dg_row_per_sample_pre_post_processor import DGRowPerSamplePrePostProcessor 4 | 5 | __all__ = [ 6 | 'PrePostProcessor', 7 | 'NetsharePrePostProcessor', 8 | 'DGRowPerSamplePrePostProcessor'] 9 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/dg_row_per_sample_pre_post_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | import os 5 | import csv 6 | from tqdm import tqdm 7 | 8 | from .pre_post_processor import PrePostProcessor 9 | from netshare.utils.field import ContinuousField, DiscreteField 10 | from 
netshare.utils.output import Normalization 11 | 12 | EPS = 1e-8 13 | 14 | 15 | class DGRowPerSamplePrePostProcessor(PrePostProcessor): 16 | def _pre_process(self, input_folder, output_folder, log_folder): 17 | # input is a file path 18 | file_path = input_folder 19 | 20 | original_df = pd.read_csv(file_path) 21 | 22 | # Remove missing rows. 23 | original_df.dropna(inplace=True) 24 | 25 | # Parse data. 26 | metadata_numpys = [] 27 | metadata_fields = [] 28 | for i, field in enumerate(self._config.metadata): 29 | if not isinstance(field.column, str): 30 | raise ValueError('"column" should be a string') 31 | this_df = original_df[field.column].astype(str) 32 | if 'regex' in field: 33 | this_df = this_df.str.extract(field.regex, expand=False) 34 | if field.type == 'string': 35 | choices = list(pd.unique(this_df)) 36 | field_instance = DiscreteField( 37 | choices=choices, 38 | name=getattr(field, 'name', field.column)) 39 | this_numpy = field_instance.normalize(this_df.to_numpy()) 40 | elif field.type == 'float': 41 | this_df = this_df.astype(np.float64) 42 | this_numpy = this_df.to_numpy() 43 | this_numpy = this_numpy.reshape((this_df.shape[0], 1)) 44 | field_instance = ContinuousField( 45 | norm_option=getattr(Normalization, field.normalization), 46 | min_x=this_numpy.min() - EPS, 47 | max_x=this_numpy.max() + EPS, 48 | dim_x=1, 49 | name=getattr(field, 'name', field.column)) 50 | this_numpy = field_instance.normalize(this_numpy) 51 | else: 52 | raise ValueError(f'Unknown field type {field.type}') 53 | metadata_numpys.append(this_numpy) 54 | metadata_fields.append(field_instance) 55 | metadata_numpy = np.concatenate( 56 | metadata_numpys, axis=1).astype(np.float64) 57 | print(f'List of metadata: ' 58 | f'{list((k.dtype, k.shape) for k in metadata_numpys)}') 59 | print(f'Metadata type: {metadata_numpy.dtype}, ' 60 | f'shape: {metadata_numpy.shape}') 61 | 62 | timeseries_numpys = [] 63 | timeseries_fields = [] 64 | for i, field in enumerate(self._config.timeseries): 65 | if not isinstance(field.columns, list): 66 | raise ValueError('"columns" should be a list') 67 | this_df = original_df[field.columns].astype(str) 68 | if 'regex' in field: 69 | for column in field.columns: 70 | this_df[column] = this_df[column].str.extract( 71 | field.regex, expand=False) 72 | if field.type == 'string': 73 | choices = list(pd.unique(this_df.values.ravel('K'))) 74 | field_instance = DiscreteField( 75 | choices=choices, 76 | name=getattr(field, 'name', field.columns)) 77 | this_numpy = field_instance.normalize(this_df.to_numpy()) 78 | this_numpy = this_numpy.reshape( 79 | (this_df.shape[0], len(field.columns), len(choices))) 80 | elif field.type == 'float': 81 | this_df = this_df.astype(np.float64) 82 | this_numpy = this_df.to_numpy() 83 | this_numpy = this_numpy.reshape( 84 | (this_df.shape[0], len(field.columns), 1)) 85 | if getattr(field, 'log1p_norm', False): 86 | this_numpy = np.log1p(this_numpy) 87 | field_instance = ContinuousField( 88 | norm_option=getattr(Normalization, field.normalization), 89 | min_x=this_numpy.min() - EPS, 90 | max_x=this_numpy.max() + EPS, 91 | dim_x=1, 92 | name=getattr(field, 'name', field.columns)) 93 | this_numpy = field_instance.normalize(this_numpy) 94 | else: 95 | raise ValueError(f'Unknown field type {field.type}') 96 | timeseries_numpys.append(this_numpy) 97 | timeseries_fields.append(field_instance) 98 | timeseries_numpy = np.concatenate(timeseries_numpys, axis=2).astype( 99 | np.float64) 100 | print(f'List of timeseries: ' 101 | f'{list((k.dtype, k.shape) for k in 
timeseries_numpys)}') 102 | print(f'Timeseries type: {timeseries_numpy.dtype}, ' 103 | f'shape: {timeseries_numpy.shape}') 104 | 105 | # Randomly select the required number of samples. 106 | np.random.seed(getattr(self._config, 'random_seed', 0)) 107 | ids = np.random.permutation(metadata_numpy.shape[0]) 108 | metadata_train_numpy = metadata_numpy[ 109 | ids[:self._config.num_train_samples]] 110 | timeseries_train_numpy = timeseries_numpy[ 111 | ids[:self._config.num_train_samples]] 112 | 113 | print(f'Metadata train type: {metadata_train_numpy.dtype}, ' 114 | f'shape: {metadata_train_numpy.shape}') 115 | print(f'Timeseries train type: {timeseries_train_numpy.dtype}, ' 116 | f'shape: {timeseries_train_numpy.shape}') 117 | 118 | # Write files 119 | with open(os.path.join( 120 | output_folder, 'data_attribute_output.pkl'), 'wb') as f: 121 | pickle.dump([v.getOutputType() for v in metadata_fields], f) 122 | with open(os.path.join( 123 | output_folder, 'data_feature_output.pkl'), 'wb') as f: 124 | pickle.dump([v.getOutputType() for v in timeseries_fields], f) 125 | with open(os.path.join( 126 | output_folder, 'data_attribute_fields.pkl'), 'wb') as f: 127 | pickle.dump(metadata_fields, f) 128 | with open(os.path.join( 129 | output_folder, 'data_feature_fields.pkl'), 'wb') as f: 130 | pickle.dump(timeseries_fields, f) 131 | npz_folder = os.path.join(output_folder, 'data_train_npz') 132 | os.makedirs(npz_folder) 133 | for i in range(metadata_train_numpy.shape[0]): 134 | np.savez( 135 | os.path.join(npz_folder, f'data_train_{i}.npz'), 136 | data_feature=timeseries_train_numpy[i], 137 | data_attribute=metadata_train_numpy[i], 138 | data_gen_flag=np.ones(timeseries_train_numpy.shape[1]), 139 | global_max_flow_len=[timeseries_train_numpy.shape[1]]) 140 | 141 | return True 142 | 143 | def _post_process(self, input_folder, output_folder, 144 | pre_processed_data_folder, log_folder): 145 | with open(os.path.join( 146 | pre_processed_data_folder, 147 | 'data_attribute_fields.pkl'), 'rb') as f: 148 | metadata_fields = pickle.load(f) 149 | with open(os.path.join( 150 | pre_processed_data_folder, 151 | 'data_feature_fields.pkl'), 'rb') as f: 152 | timeseries_fields = pickle.load(f) 153 | sub_folders = os.listdir(input_folder) 154 | for sub_folder in sub_folders: 155 | data_path = os.path.join(input_folder, sub_folder, 'data.npz') 156 | data = np.load(data_path) 157 | unnormalized_timeseries = data['data_feature'] 158 | unnormalized_metadata = data['data_attribute'] 159 | data_gen_flag = data['data_gen_flag'] 160 | timeseries = [] 161 | metadata = [] 162 | dim = 0 163 | for field_i, field in enumerate(metadata_fields): 164 | sub_metadata = field.denormalize( 165 | unnormalized_metadata[ 166 | :, dim: dim + field.getOutputType().dim]) 167 | if getattr(self._config.metadata[field_i], 'log1p_norm', 168 | False): 169 | sub_metadata = np.exp(sub_metadata) - 1 170 | if isinstance(field, ContinuousField): 171 | sub_metadata = sub_metadata[:, 0] 172 | metadata.append(sub_metadata) 173 | dim += field.getOutputType().dim 174 | assert dim == unnormalized_metadata.shape[1] 175 | 176 | timeseries = [] 177 | dim = 0 178 | for field_i, field in enumerate(timeseries_fields): 179 | sub_timeseries = field.denormalize( 180 | unnormalized_timeseries[ 181 | :, :, dim: dim + field.getOutputType().dim]) 182 | if getattr(self._config.timeseries[field_i], 'log1p_norm', 183 | False): 184 | sub_timeseries = np.exp(sub_timeseries) - 1 185 | if isinstance(field, ContinuousField): 186 | sub_timeseries = sub_timeseries[:, :, 0] 187 | 
timeseries.append(sub_timeseries) 188 | dim += field.getOutputType().dim 189 | assert dim == unnormalized_timeseries.shape[2] 190 | 191 | csv_folder = os.path.join(output_folder, sub_folder) 192 | os.makedirs(csv_folder) 193 | csv_path = os.path.join(csv_folder, 'data.csv') 194 | with open(csv_path, 'w') as f: 195 | writer = csv.writer(f) 196 | writer.writerow( 197 | [field.name for field in metadata_fields] + 198 | [column_name for field in timeseries_fields 199 | for column_name in field.name]) 200 | for i in tqdm(range(unnormalized_timeseries.shape[0])): 201 | writer.writerow( 202 | [d[i] for d in metadata] + 203 | [sd 204 | for d in timeseries 205 | for sd in d[i][:int(np.sum(data_gen_flag[i]))]]) 206 | return True 207 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/README.md: -------------------------------------------------------------------------------- 1 | In case of any change to the `main.c` and `packet.h` file, please use `sharedlib.sh` to create new shared library. -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/choose_best_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from scipy.stats import rankdata 8 | from .util import create_sdmetrics_config, convert_sdmetricsConfigQuant_to_fieldValueDict 9 | from sdmetrics.reports.timeseries import QualityReport 10 | 11 | 12 | def compare_rawdf_syndfs( 13 | raw_df, 14 | syn_dfs, 15 | config_pre_post_processor 16 | ): 17 | # Compare raw_df and syn_dfs and return the best syn_df 18 | sdmetrics_config = create_sdmetrics_config( 19 | config_pre_post_processor, 20 | comparison_type='quantitative') 21 | report = QualityReport(config_dict=sdmetrics_config['config']) 22 | 23 | metrics_dict_list = [] 24 | for syn_df in syn_dfs: 25 | report.generate( 26 | raw_df, syn_df, sdmetrics_config['metadata']) 27 | metrics_dict_list.append( 28 | convert_sdmetricsConfigQuant_to_fieldValueDict( 29 | report.dict_metric_scores)) 30 | 31 | metrics = list(metrics_dict_list[0].keys()) 32 | metric_vals_dict = {} 33 | for metrics_dict in metrics_dict_list: 34 | for metric in metrics: 35 | if metric not in metric_vals_dict: 36 | metric_vals_dict[metric] = [] 37 | metric_vals_dict[metric].append(metrics_dict[metric]) 38 | metric_vals_2d = [] 39 | for metric, vals in metric_vals_dict.items(): 40 | metric_vals_2d.append(vals) 41 | rankings_sum = np.sum(rankdata(metric_vals_2d, axis=1), axis=0) 42 | best_syndf_idx = np.argmin(rankdata(rankings_sum)) 43 | 44 | return best_syndf_idx, syn_dfs[best_syndf_idx] 45 | 46 | 47 | def choose_best_model( 48 | config_pre_post_processor, 49 | pre_processed_data_folder, 50 | generated_data_folder, 51 | post_processed_data_folder 52 | ): 53 | with open(os.path.join(generated_data_folder, "configs_generate.json"), 'r') as f: 54 | data = json.load(f) 55 | configs = data["configs"] 56 | # add pre_processor configs in place 57 | for config in configs: 58 | config.update(config_pre_post_processor) 59 | config_group_list = data["config_group_list"] 60 | 61 | # TODO: change to distribute (Ray-style) 62 | dict_dataset_syndfs = {} 63 | for config_group_idx, config_group in enumerate(config_group_list): 64 | print("Config group #{}: {}".format(config_group_idx, config_group)) 65 | config_ids = config_group["config_ids"] 66 | chunk0_idx = config_ids[0] 67 | 
syndf_root_folder = os.path.join( 68 | configs[chunk0_idx]["eval_root_folder"], "syn_dfs" 69 | ) 70 | assert len( 71 | [ 72 | file 73 | for file in os.listdir(syndf_root_folder) 74 | if file.startswith("chunk_id") 75 | ] 76 | ) == len(config_ids) 77 | 78 | best_syndfs = [] 79 | truncate_ratios = [] 80 | for chunk_id, config_idx in enumerate(config_ids): 81 | config = configs[config_idx] 82 | raw_df = pd.read_csv(os.path.join(config["dataset"], "raw.csv")) 83 | time_col_name = getattr( 84 | getattr(config_pre_post_processor, 'timestamp'), 85 | 'column') 86 | 87 | syn_dfs = [] 88 | syn_dfs_names = [] 89 | syn_df_folder = os.path.join( 90 | syndf_root_folder, "chunk_id-{}".format(chunk_id) 91 | ) 92 | for file in os.listdir(syn_df_folder): 93 | if file.endswith(".csv"): 94 | syn_dfs_names.append(file) 95 | syn_df = pd.read_csv(os.path.join(syn_df_folder, file)) 96 | 97 | # truncate to raw data time range 98 | if config["truncate"] == "per_chunk": 99 | syn_df_truncated = syn_df[ 100 | (syn_df[time_col_name] >= raw_df[time_col_name].min()) 101 | & (syn_df[time_col_name] <= raw_df[time_col_name].max()) 102 | ] 103 | # TODO: support more truncation methods if necessary 104 | else: 105 | raise ValueError("Unknown truncation methods...") 106 | truncate_ratios.append( 107 | 1.0 - len(syn_df_truncated) / len(syn_df)) 108 | 109 | syn_dfs.append(syn_df_truncated) 110 | 111 | best_syndf_idx, best_syndf = compare_rawdf_syndfs( 112 | raw_df[syn_dfs[0].columns], syn_dfs, config_pre_post_processor 113 | ) 114 | 115 | best_syndfs.append(best_syndf) 116 | print( 117 | "Chunk_id: {}, # of syn dfs: {}, best_syndf: {}".format( 118 | chunk_id, len(syn_dfs), syn_dfs_names[best_syndf_idx] 119 | ) 120 | ) 121 | 122 | print("Average truncation ratio:", np.mean(truncate_ratios)) 123 | big_best_syndf = pd.concat(best_syndfs) 124 | print("Big syndf shape:", big_best_syndf.shape) 125 | print() 126 | 127 | if config_group["dp_noise_multiplier"] not in dict_dataset_syndfs: 128 | dict_dataset_syndfs[config_group["dp_noise_multiplier"]] = [] 129 | dict_dataset_syndfs[config_group["dp_noise_multiplier"]].append( 130 | big_best_syndf) 131 | 132 | dict_dataset_bestsyndf = {} 133 | 134 | big_raw_df = pd.read_csv(os.path.join(pre_processed_data_folder, "raw.csv")) 135 | for dpnoisemultiplier, syn_dfs in dict_dataset_syndfs.items(): 136 | assert len(syn_dfs) >= 1 137 | if len(syn_dfs) > 1: 138 | best_syndf_idx, best_syn_df = compare_rawdf_syndfs( 139 | big_raw_df[syn_dfs[0].columns], 140 | syn_dfs, config_pre_post_processor) 141 | dict_dataset_bestsyndf[dpnoisemultiplier] = best_syn_df 142 | else: 143 | dict_dataset_bestsyndf[dpnoisemultiplier] = syn_dfs[0] 144 | 145 | print("Aggregated final dataset syndf") 146 | for dp_noise_multiplier, best_syndf in dict_dataset_bestsyndf.items(): 147 | print(dp_noise_multiplier, best_syndf.shape) 148 | best_syndf_folder = post_processed_data_folder 149 | os.makedirs(best_syndf_folder, exist_ok=True) 150 | 151 | # find best syndf index i.e., for evaluation fairness 152 | cur_max_idx = None 153 | for file in os.listdir(best_syndf_folder): 154 | if file.startswith( 155 | "syn_df,dp_noise_multiplier-{},truncate-{},id-".format( 156 | dp_noise_multiplier, config["truncate"] 157 | ) 158 | ): 159 | this_id = int(os.path.splitext(file)[ 160 | 0].split(",")[-1].split("-")[1]) 161 | if cur_max_idx is None or this_id > cur_max_idx: 162 | cur_max_idx = this_id 163 | if cur_max_idx is None: 164 | cur_max_idx = 1 165 | else: 166 | cur_max_idx += 1 167 | 168 | best_syndf_filename = os.path.join( 169 | 
best_syndf_folder, 170 | "syn_df,dp_noise_multiplier-{},truncate-{},id-{}.csv".format( 171 | dp_noise_multiplier, config["truncate"], cur_max_idx) 172 | ) 173 | # best_syndf_filename = os.path.join(best_syndf_folder, "syn.csv") 174 | 175 | print("best_syn_df filename:", best_syndf_filename) 176 | 177 | # sort by timestamp if applicable 178 | if config_pre_post_processor.timestamp.generation: 179 | time_col_name = config_pre_post_processor.timestamp.column 180 | best_syndf = best_syndf.sort_values(time_col_name) 181 | best_syndf.to_csv(best_syndf_filename, index=False) 182 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/denormalize_fields.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import json 4 | import random 5 | import pickle 6 | from typing import Dict, List 7 | 8 | import numpy as np 9 | from config_io import Config 10 | from tqdm import tqdm 11 | 12 | from netshare.utils.logger import logger 13 | 14 | 15 | def _get_fields_names(fields_list): 16 | """ 17 | This function returns the names of the given fields. 18 | """ 19 | field_names = [] 20 | for field in fields_list: 21 | if isinstance(field.name, list): 22 | field_names.extend(field.name) 23 | else: 24 | field_names.append(field.name) 25 | return field_names 26 | 27 | 28 | def _denormalize_by_fields_list( 29 | normalized_data, 30 | fields_list, 31 | is_session_key 32 | ): 33 | """ 34 | This function executes field.denormalize for each of the given field. 35 | """ 36 | denormalized_data = [] 37 | dim = 0 38 | 39 | for field in fields_list: 40 | if is_session_key: 41 | sub_data = normalized_data[:, dim: dim + field.dim_x] 42 | else: 43 | sub_data = normalized_data[:, :, dim: dim + field.dim_x] 44 | 45 | sub_data = field.denormalize(sub_data) 46 | 47 | # For session key, if shape looks like (n, ), change it to (n, 1) for consistency 48 | if is_session_key and len(sub_data.shape) == 1: 49 | sub_data = np.expand_dims(sub_data, axis=1) 50 | # For timeseries, if shape looks like (i, j), change it to (i, j, 1) for consistency 51 | if not is_session_key and len(sub_data.shape) == 2: 52 | sub_data = np.expand_dims(sub_data, axis=2) 53 | denormalized_data.append(sub_data) 54 | dim += field.dim_x 55 | return denormalized_data 56 | 57 | 58 | def write_to_csv( 59 | csv_folder, 60 | session_key_fields, 61 | timeseries_fields, 62 | session_key, 63 | timeseries, 64 | data_gen_flag, 65 | filename, 66 | config, 67 | ) -> None: 68 | """ 69 | This function dumps the given data to the given directory as a csv format. 70 | `data_gen_flag` is an indicator showing if the time series for this session 71 | has ended in this time step. 
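    Rows whose session key duplicates one that has already been written are
    skipped.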
72 | """ 73 | os.makedirs(csv_folder, exist_ok=True) 74 | csv_path = os.path.join(csv_folder, filename) 75 | # change session key shape to #session * #attributes 76 | session_key_numpy = np.array(np.concatenate(session_key, axis=1)) 77 | # change timeseries shape to #session * #time_steps * #features 78 | timeseries_numpy = np.array(np.concatenate(timeseries, axis=2)) 79 | 80 | with open(csv_path, "w") as f: 81 | writer = csv.writer(f) 82 | raw_metadata_field_names = [ 83 | col.column for col in (config["metadata"]) 84 | ] 85 | raw_timeseries_filed_names = [ 86 | col.column for col in config["timeseries"]] 87 | session_titles = [ 88 | f for i, f in enumerate(_get_fields_names(session_key_fields)) 89 | if f in raw_metadata_field_names] 90 | session_titles_idx = [ 91 | i for i, f in enumerate(_get_fields_names(session_key_fields)) 92 | if f in raw_metadata_field_names] 93 | timeseries_titles = [ 94 | f 95 | for i, f in enumerate(_get_fields_names(timeseries_fields)) 96 | if f in raw_timeseries_filed_names 97 | ] 98 | timeseries_titles_idx = [ 99 | i 100 | for i, f in enumerate(_get_fields_names(timeseries_fields)) 101 | if f in raw_timeseries_filed_names 102 | ] 103 | 104 | if config["timestamp"].get("generation", False): 105 | timeseries_titles.append(config["timestamp"]["column"]) 106 | if config["timestamp"]["encoding"] == "interarrival": 107 | # Find `flow_start` and `interarrival_within_flow` index 108 | flow_start_idx, interarrival_within_flow_idx = None, None 109 | for idx, field_name in enumerate( 110 | _get_fields_names(session_key_fields)): 111 | if field_name == "flow_start": 112 | flow_start_idx = idx 113 | break 114 | for idx, field_name in enumerate( 115 | _get_fields_names(timeseries_fields)): 116 | if field_name == "interarrival_within_flow": 117 | interarrival_within_flow_idx = idx 118 | break 119 | if flow_start_idx is None or interarrival_within_flow_idx is None: 120 | raise ValueError( 121 | "Using `interarrival` encoding: `flow_start` or `interarrival_field` not found!" 
122 | ) 123 | 124 | # convert interarrival to raw timestamp 125 | interarrival_cumsum = np.cumsum( 126 | timeseries_numpy[:, :, interarrival_within_flow_idx].astype( 127 | float), 128 | axis=1) 129 | # first packet has 0.0 interarrival 130 | interarrival_cumsum[:, 0] = 0.0 131 | flow_start_expand = ( 132 | np.array( 133 | [ 134 | session_key_numpy[:, flow_start_idx], 135 | ] 136 | * interarrival_cumsum.shape[1] 137 | ) 138 | .transpose() 139 | .astype(float) 140 | ) 141 | timestamp_matrix = np.expand_dims( 142 | np.add(flow_start_expand, interarrival_cumsum), axis=2 143 | ) 144 | timeseries_numpy = np.concatenate( 145 | (timeseries_numpy, timestamp_matrix), axis=2 146 | ) 147 | timeseries_titles_idx.append(timeseries_numpy.shape[2] - 1) 148 | 149 | writer.writerow(session_titles + timeseries_titles) 150 | 151 | session_key_set = set() 152 | for ( 153 | data_gen_per_session, 154 | session_data_per_session, 155 | timeseries_per_session, 156 | ) in zip( 157 | data_gen_flag, 158 | # remove cols not in raw data 159 | session_key_numpy[:, session_titles_idx], 160 | timeseries_numpy[ 161 | :, :, timeseries_titles_idx 162 | ], # remove cols not in raw data 163 | ): 164 | session_data_per_session = session_data_per_session.tolist() 165 | # remove duplicated session keys 166 | if tuple(session_data_per_session) in session_key_set: 167 | logger.debug( 168 | "Session key {session_data_per_session} already exists!") 169 | continue 170 | session_key_set.add(tuple(session_data_per_session)) 171 | for j in range(data_gen_per_session.shape[0]): 172 | if data_gen_per_session[j] == 1.0: 173 | timeseries_data = timeseries_per_session[j].tolist() 174 | writer.writerow(session_data_per_session + timeseries_data) 175 | 176 | 177 | def denormalize_fields( 178 | config_pre_post_processor, 179 | pre_processed_data_folder, 180 | generated_data_folder, 181 | post_processed_data_folder 182 | ): 183 | """ 184 | This function denormalizes the data in the generated_data folder using the attributes and features fields that were created in the pre-process step. 185 | Last, it writes the denormalized data to a csv file under the same directory hierarchy as the created data. 186 | 187 | :return: the path to the denormalized data. 
188 | """ 189 | with open(os.path.join(generated_data_folder, "configs_generate.json"), 'r') as f: 190 | data = json.load(f) 191 | configs = data["configs"] 192 | config_group_list = data["config_group_list"] 193 | 194 | for config in tqdm(configs): 195 | with open(os.path.join( 196 | pre_processed_data_folder, 197 | f"chunkid-{config['chunk_id']}", 198 | "data_attribute_fields.pkl" 199 | ), 'rb') as f: 200 | session_key_fields = list(pickle.load(f)) 201 | 202 | with open(os.path.join( 203 | pre_processed_data_folder, 204 | f"chunkid-{config['chunk_id']}", 205 | "data_feature_fields.pkl" 206 | ), 'rb') as f: 207 | timeseries_fields = list(pickle.load(f)) 208 | 209 | # Each configuration has multiple iteration ckpts 210 | per_chunk_basedir = os.path.join( 211 | config["eval_root_folder"], 212 | "feat_raw", f"chunk_id-{config['chunk_id']}") 213 | for f in os.listdir(per_chunk_basedir): 214 | if not f.endswith(".npz"): 215 | continue 216 | data = np.load(os.path.join(per_chunk_basedir, f)) 217 | unnormalized_session_key = data["data_attribute"] 218 | unnormalized_timeseries = data["data_feature"] 219 | data_gen_flag = data["data_gen_flag"] 220 | 221 | session_key = _denormalize_by_fields_list( 222 | unnormalized_session_key, session_key_fields, 223 | is_session_key=True) 224 | timeseries = _denormalize_by_fields_list( 225 | unnormalized_timeseries, timeseries_fields, is_session_key=False 226 | ) 227 | 228 | csv_root_folder = os.path.join( 229 | config["eval_root_folder"], "syn_dfs") 230 | csv_filename = f.replace(".npz", ".csv") 231 | write_to_csv( 232 | csv_folder=os.path.join( 233 | csv_root_folder, f"chunk_id-{config['chunk_id']}" 234 | ), 235 | session_key_fields=session_key_fields, 236 | timeseries_fields=timeseries_fields, 237 | session_key=session_key, 238 | timeseries=timeseries, 239 | data_gen_flag=data_gen_flag, 240 | filename=csv_filename, 241 | config=config_pre_post_processor, 242 | ) 243 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/dist_metrics.py: -------------------------------------------------------------------------------- 1 | from .embedding_helper import build_annoy_dictionary_word2vec, get_original_obj 2 | from netshare.utils import ContinuousField, DiscreteField, BitField 3 | from netshare.utils import Normalization 4 | from netshare.utils import Tee, Output 5 | from scipy.spatial import distance 6 | from scipy.stats import wasserstein_distance 7 | from collections import Counter, OrderedDict 8 | from gensim.models import Word2Vec 9 | from tqdm import tqdm 10 | import statsmodels.api as sm 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import matplotlib 14 | import numpy as np 15 | import sys 16 | import configparser 17 | import json 18 | import random 19 | import copy 20 | import math 21 | import os 22 | import pickle 23 | random.seed(42) 24 | 25 | 26 | # avoid type3 fonts 27 | matplotlib.rcParams['pdf.fonttype'] = 42 28 | matplotlib.rcParams['ps.fonttype'] = 42 29 | matplotlib.rcParams.update({'font.size': 15}) 30 | 31 | # color-blindness friendly 32 | CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a', 33 | '#f781bf', '#a65628', '#984ea3', 34 | '#999999', '#e41a1c', '#dede00'] 35 | # colors = { 36 | # 'blue': [55, 126, 184], #377eb8 37 | # 'orange': [255, 127, 0], #ff7f00 38 | # 'green': [77, 175, 74], #4daf4a 39 | # 'pink': [247, 129, 191], #f781bf 40 | # 'brown': [166, 86, 40], #a65628 41 | # 'purple': [152, 78, 163], #984ea3 42 | # 'gray': [153, 153, 153], #999999 43 | 
# 'red': [228, 26, 28], #e41a1c 44 | # 'yellow': [222, 222, 0] #dede00 45 | # } 46 | 47 | # https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml 48 | dict_pr_str2int = { 49 | "ESP": 50, 50 | "GRE": 47, 51 | "ICMP": 1, 52 | "IPIP": 4, 53 | "IPv6": 41, 54 | "TCP": 6, 55 | "UDP": 17, 56 | "RSVP": 46, 57 | "Other": 255, 58 | "255": 255, # TEMP 59 | } 60 | 61 | 62 | # jsd 63 | def jsd(p, q, type): 64 | p = list(p) 65 | q = list(q) 66 | 67 | if type == "discrete": 68 | # append 0 to shorter arrays: only for IP 69 | pq_max_len = max(len(p), len(q)) 70 | p += [0.0] * (pq_max_len - len(p)) 71 | q += [0.0] * (pq_max_len - len(q)) 72 | assert (len(p) == len(q)) 73 | return distance.jensenshannon(p, q)**2 74 | 75 | elif type == "continuous": 76 | # min_ = min(min(p), min(q)) 77 | # max_ = max(max(p), max(q)) 78 | 79 | min_ = min(p) 80 | max_ = max(p) 81 | 82 | # assume p is raw data 83 | # compute n_bins by FD on raw data; use across baselines 84 | p_counts, p_bin_edges = np.histogram( 85 | p, range=(min_, max_), bins="auto") 86 | q_counts, q_bin_edges = np.histogram( 87 | q, range=(min_, max_), bins=len(p_counts)) 88 | 89 | # out of range 90 | q_arr = np.array(q) 91 | q_arr_lt_realmin = q_arr[q_arr < min_] 92 | q_arr_gt_realmax = q_arr[q_arr > max_] 93 | 94 | if len(q_arr_lt_realmin) > 0: 95 | np.insert(q_counts, 0, len(q_arr_lt_realmin)) 96 | np.insert(p_counts, 0, 0.0) 97 | if len(q_arr_gt_realmax) > 0: 98 | np.append(q_counts, len(q_arr_gt_realmax)) 99 | np.append(p_counts, 0.0) 100 | 101 | return distance.jensenshannon(p_counts, q_counts)**2 102 | 103 | else: 104 | raise ValueError("Unknown JSD data type") 105 | 106 | 107 | def compute_IP_rank_distance(real_list, syn_list, type="EMD"): 108 | real_HH_count = OrderedDict(Counter(real_list).most_common()) 109 | syn_HH_count = OrderedDict(Counter(syn_list).most_common()) 110 | 111 | real_rank_list = [] 112 | idx = 1 113 | for k, v in real_HH_count.items(): 114 | real_rank_list += [idx] * v 115 | idx += 1 116 | 117 | syn_rank_list = [] 118 | idx = 1 119 | for k, v in syn_HH_count.items(): 120 | syn_rank_list += [idx] * v 121 | idx += 1 122 | 123 | if type == "EMD": 124 | return wasserstein_distance(real_rank_list, syn_rank_list) 125 | elif type == "JSD": 126 | return jsd(real_HH_count.values(), 127 | syn_HH_count.values(), type="discrete") 128 | else: 129 | raise ValueError("Unknown distance metric!") 130 | 131 | # type == "freq": return the freq dict 132 | 133 | 134 | def compute_port_proto_distance( 135 | real_list, syn_list, opt, prstr_raw=True, prstr_syn=True, type="TV"): 136 | real_list = list(real_list) 137 | syn_list = list(syn_list) 138 | 139 | # TCP: 6 140 | # UDP: 17 141 | # Other: 255, used for binning other protocols 142 | if opt == "proto": 143 | # convert to integer if protocol is string (e.g., "TCP"/"UDP") 144 | if isinstance(real_list[0], str): 145 | real_list_numeric = [] 146 | for i in real_list: 147 | i = i.strip() 148 | real_list_numeric.append(dict_pr_str2int[i.upper()]) 149 | real_list = real_list_numeric 150 | 151 | if isinstance(syn_list[0], str): 152 | syn_list_numeric = [] 153 | for i in syn_list: 154 | i = i.strip() 155 | syn_list_numeric.append(dict_pr_str2int[i.upper()]) 156 | syn_list = syn_list_numeric 157 | 158 | if opt == "srcport" or opt == "dstport": 159 | real_dict = {} 160 | syn_dict = {} 161 | for i in range(65536): 162 | real_dict[i] = 0 163 | syn_dict[i] = 0 164 | for i in real_list: 165 | real_dict[int(i)] += float(1 / len(real_list)) 166 | for i in syn_list: 167 | if i < 0: 168 | i = 0 169 
| elif i > 65535: 170 | i = 65535 171 | syn_dict[int(i)] += float(1 / len(syn_list)) 172 | 173 | if type == "TV": 174 | tv_distance = 0 175 | for i in range(65536): 176 | tv_distance += 0.5 * abs(real_dict[i] - syn_dict[i]) 177 | return tv_distance 178 | elif type == "JSD": 179 | return jsd(real_dict.values(), syn_dict.values(), type="discrete") 180 | elif type == "freq": 181 | return real_dict, syn_dict 182 | else: 183 | raise ValueError("Unknown distance metric!") 184 | 185 | elif opt == "proto": 186 | real_dict = {} 187 | syn_dict = {} 188 | for i in range(256): 189 | real_dict[i] = 0 190 | syn_dict[i] = 0 191 | for i in real_list: 192 | real_dict[int(i)] += float(1 / len(real_list)) 193 | for i in syn_list: 194 | syn_dict[int(i)] += float(1 / len(syn_list)) 195 | 196 | if type == "TV": 197 | tv_distance = 0 198 | for i in range(256): 199 | tv_distance += 0.5 * abs(real_dict[i] - syn_dict[i]) 200 | return tv_distance 201 | elif type == "JSD": 202 | return jsd(real_dict.values(), syn_dict.values(), type="discrete") 203 | elif type == "freq": 204 | return real_dict, syn_dict 205 | else: 206 | raise ValueError("Unknown distance metric!") 207 | 208 | 209 | def get_flowduration(df): 210 | df = df.sort_values("time") 211 | 212 | metadata = ["srcip", "dstip", "srcport", "dstport", "proto"] 213 | gk = df.groupby(by=metadata) 214 | 215 | flow_duration_list = [] 216 | 217 | for name, group in gk: 218 | time_list = list(group["time"]) 219 | flow_duration_list.append(time_list[-1] - time_list[0]) 220 | 221 | return flow_duration_list 222 | 223 | 224 | def compute_metrics_netflow_v3(raw_df, syn_df): 225 | '''JSD + EMD + ranking''' 226 | metrics_dict = {} 227 | 228 | # IP popularity rank 229 | for metric in ["srcip", "dstip"]: 230 | metrics_dict[metric] = compute_IP_rank_distance( 231 | raw_df[metric], syn_df[metric], type="JSD") 232 | 233 | # TV distance for port/protocol 234 | for metric in ["srcport", "dstport", "proto"]: 235 | metrics_dict[metric] = compute_port_proto_distance( 236 | raw_df[metric], 237 | syn_df[metric], 238 | metric, prstr_raw=True, prstr_syn=True, type="JSD") 239 | 240 | # ts, td, pkt, byt 241 | for metric in ["ts", "td", "pkt", "byt"]: 242 | if metric == "ts": 243 | raw_df = raw_df.sort_values("ts").reset_index() 244 | syn_df = syn_df.sort_values("ts").reset_index() 245 | raw_list = list(raw_df["ts"] - raw_df["ts"][0]) 246 | syn_list = list(syn_df["ts"] - syn_df["ts"][0]) 247 | metrics_dict[metric] = wasserstein_distance(raw_list, syn_list) 248 | else: 249 | metrics_dict[metric] = wasserstein_distance( 250 | list(raw_df[metric]), list(syn_df[metric])) 251 | 252 | return metrics_dict 253 | 254 | 255 | def compute_metrics_zeeklog_v3(raw_df, syn_df): 256 | '''JSD + EMD + ranking''' 257 | metrics_dict = {} 258 | 259 | # IP popularity rank 260 | for metric in ["srcip", "dstip"]: 261 | metrics_dict[metric] = compute_IP_rank_distance( 262 | raw_df[metric], syn_df[metric], type="JSD") 263 | 264 | # TV distance for port/protocol 265 | for metric in ["srcport", "dstport", "proto"]: 266 | metrics_dict[metric] = compute_port_proto_distance( 267 | raw_df[metric], 268 | syn_df[metric], 269 | metric, prstr_raw=True, prstr_syn=True, type="JSD") 270 | 271 | # ts,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts, 272 | # orig_ip_bytes,resp_pkts,resp_ip_bytes 273 | for metric in ["ts", "duration", "orig_bytes", "resp_bytes", "missed_bytes", 274 | "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes"]: 275 | if metric == "ts": 276 | raw_df = raw_df.sort_values("ts").reset_index() 277 | 
syn_df = syn_df.sort_values("ts").reset_index() 278 | raw_list = list(raw_df["ts"] - raw_df["ts"][0]) 279 | syn_list = list(syn_df["ts"] - syn_df["ts"][0]) 280 | metrics_dict[metric] = wasserstein_distance(raw_list, syn_list) 281 | else: 282 | metrics_dict[metric] = wasserstein_distance( 283 | list(raw_df[metric]), list(syn_df[metric])) 284 | 285 | # TODO: Important!! How to define the JSD of service and conn_state? 286 | 287 | return metrics_dict 288 | 289 | 290 | def compute_metrics_pcap_v3(raw_df, syn_df): 291 | '''JSD + EMD + ranking''' 292 | metrics_dict = {} 293 | 294 | # IP popularity rank 295 | for metric in ["srcip", "dstip"]: 296 | metrics_dict[metric] = compute_IP_rank_distance( 297 | raw_df[metric], syn_df[metric], type="JSD") 298 | 299 | # TV distance for port/protocol 300 | for metric in ["srcport", "dstport", "proto"]: 301 | metrics_dict[metric] = compute_port_proto_distance( 302 | raw_df[metric], 303 | syn_df[metric], 304 | metric, prstr_raw=True, prstr_syn=True, type="JSD") 305 | 306 | # pkt_len 307 | for metric in ["pkt_len", "time"]: 308 | # if metric == "time": 309 | # label = "pkt_arrivalTime" 310 | # else: 311 | # label = metric 312 | 313 | if metric == "time": 314 | raw_df = raw_df.sort_values("time").reset_index() 315 | syn_df = syn_df.sort_values("time").reset_index() 316 | raw_list = list(raw_df["time"] - raw_df["time"][0]) 317 | syn_list = list(syn_df["time"] - syn_df["time"][0]) 318 | metrics_dict[metric] = wasserstein_distance(raw_list, syn_list) 319 | else: 320 | metrics_dict[metric] = wasserstein_distance( 321 | list(raw_df[metric]), list(syn_df[metric])) 322 | 323 | # interarrival time 324 | # raw_df = raw_df.sort_values("time") 325 | # syn_df = syn_df.sort_values("time") 326 | # metrics_dict["PIAT"] = wasserstein_distance(list(np.diff(raw_df["time"])), list(np.diff(syn_df["time"]))) 327 | 328 | # flow size distribution 329 | metadata = ["srcip", "dstip", "srcport", "dstport", "proto"] 330 | raw_gk = raw_df.groupby(by=metadata) 331 | syn_gk = syn_df.groupby(by=metadata) 332 | 333 | raw_flowsize_list = list(raw_gk.size().values) 334 | syn_flowsize_list = list(syn_gk.size().values) 335 | metrics_dict["flow_size"] = wasserstein_distance( 336 | raw_flowsize_list, syn_flowsize_list) 337 | 338 | return metrics_dict 339 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/embedding_helper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | 4 | from annoy import AnnoyIndex 5 | from gensim.models import Word2Vec 6 | from sklearn.neighbors import NearestNeighbors 7 | from tqdm import tqdm 8 | 9 | 10 | def build_annoy_dictionary_word2vec( 11 | df, 12 | model_path, 13 | word2vec_cols, 14 | word2vec_size, 15 | n_trees): 16 | 17 | model = Word2Vec.load(model_path) 18 | wv = model.wv 19 | 20 | # type : [cols] 21 | # ("ip": ["srcip", "dstip"]) 22 | # "port": ["srcport", "dstport"] 23 | # "proto": ["proto"] 24 | dict_type_cols = {} 25 | for col in word2vec_cols: 26 | type = col.encoding.split("_")[1] 27 | if type not in dict_type_cols: 28 | dict_type_cols[type] = [] 29 | dict_type_cols[type].append(col.column) 30 | print(dict_type_cols) 31 | 32 | sets = [] 33 | dict_type_annDictPair = {} 34 | for type, cols in dict_type_cols.items(): 35 | type_set = set(list(itertools.chain.from_iterable( 36 | [list(df[col]) for col in cols]))) 37 | type_ann = AnnoyIndex(word2vec_size, 'angular') 38 | type_dict = {} 39 | index = 0 40 | 41 | for 
ele in type_set: 42 | type_ann.add_item(index, get_vector( 43 | model, str(ele), norm_option=True)) 44 | type_dict[index] = ele 45 | index += 1 46 | type_ann.build(n_trees) 47 | 48 | dict_type_annDictPair[type] = (type_ann, type_dict) 49 | 50 | print("Finish building Angular trees...") 51 | 52 | return dict_type_annDictPair 53 | 54 | 55 | def get_original_obj(ann, vector, dic): 56 | obj_list = ann.get_nns_by_vector( 57 | vector, 1, search_k=-1, include_distances=False) 58 | 59 | return dic[obj_list[0]] 60 | 61 | 62 | def get_original_objs(ann, vectors, dic): 63 | res = [] 64 | for vector in vectors: 65 | obj_list = ann.get_nns_by_vector( 66 | vector, 1, search_k=-1, include_distances=False) 67 | res.append(dic[obj_list[0]]) 68 | return res 69 | 70 | # return vector for the given word 71 | 72 | 73 | def get_vector(model, word, norm_option=False): 74 | all_words_str = list(model.wv.vocab.keys()) 75 | 76 | # Privacy-related 77 | # If word not in the vocabulary, replace with nearest neighbor 78 | # Suppose that protocol is covered 79 | # while very few port numbers are out of range 80 | if word not in all_words_str: 81 | print(f"{word} not in dict") 82 | print("Help!!!!") 83 | all_words = [] 84 | for ele in all_words_str: 85 | if ele.isdigit(): 86 | all_words.append(int(ele)) 87 | all_words = np.array(all_words).reshape((-1, 1)) 88 | nbrs = NearestNeighbors( 89 | n_neighbors=1, algorithm='ball_tree').fit(all_words) 90 | distances, indices = nbrs.kneighbors([[int(word)]]) 91 | nearest_word = str(all_words[indices[0][0]][0]) 92 | # print("nearest_word:", nearest_word) 93 | model.init_sims() 94 | return model.wv.word_vec(nearest_word, use_norm=norm_option) 95 | else: 96 | model.init_sims() 97 | return model.wv.word_vec(word, use_norm=norm_option) 98 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/main.c: -------------------------------------------------------------------------------- 1 | // https://www.binarytides.com/packet-sniffer-code-c-libpcap-linux-sockets/ 2 | // http://tonylukasavage.com/blog/2010/12/19/offline-packet-capture-analysis-with-c-c----amp--libpcap/ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | /* 17 | pcap and network related 18 | */ 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "packet.h" 28 | 29 | #define ETHER_HDR_TRUNCATE 0 30 | 31 | int tcp = 0, udp = 0, icmp = 0, others = 0, total = 0; 32 | 33 | void packetHandler(u_char *userData, const struct pcap_pkthdr *pkthdr, const u_char *packet) 34 | { 35 | struct ether_header *ep; 36 | unsigned short ether_type; 37 | 38 | // read Ethernet header if not truncated 39 | if (!ETHER_HDR_TRUNCATE) 40 | { 41 | ep = (struct ether_header *)packet; 42 | 43 | // protocol type 44 | ether_type = ntohs(ep->ether_type); 45 | 46 | // IPv4 47 | if (ether_type == ETHERTYPE_IP) 48 | { 49 | packet += ETHER_HDR_LEN; 50 | } 51 | 52 | // 802.1Q 53 | else if (ether_type == ETHERTYPE_VLAN) 54 | { 55 | ether_type = ntohs(*(uint16_t *)(packet + 16)); 56 | 57 | // only process IP packet 58 | if (ether_type == ETHERTYPE_IP) 59 | { 60 | packet += ETHER_HDR_LEN; 61 | packet += 4; 62 | } 63 | 64 | else 65 | return; 66 | } 67 | } 68 | 69 | Packet p; 70 | 71 | /* Timestamp */ 72 | unsigned long time_in_micros = pkthdr->ts.tv_sec * 1000000 + pkthdr->ts.tv_usec; 73 | p.timestamp = time_in_micros; 74 | // 
printf("timestamp: %lu\n", time_in_micros); 75 | 76 | /* IP header */ 77 | // Note: caida traces does not include ethernet headers 78 | // data_center (IMC 2010): enternet headers, YES 79 | // MACCDC_2012 80 | const struct ip *ipHeader; 81 | char sourceIp[INET_ADDRSTRLEN]; 82 | char destIp[INET_ADDRSTRLEN]; 83 | 84 | ipHeader = (struct ip *)(packet); 85 | 86 | inet_ntop(AF_INET, &(ipHeader->ip_src), sourceIp, INET_ADDRSTRLEN); 87 | inet_ntop(AF_INET, &(ipHeader->ip_dst), destIp, INET_ADDRSTRLEN); 88 | 89 | struct in_addr tmpPkt1, tmpPkt2; 90 | inet_aton(sourceIp, &tmpPkt1); 91 | inet_aton(destIp, &tmpPkt2); 92 | 93 | p.srcip = ntohl(tmpPkt1.s_addr); 94 | p.dstip = ntohl(tmpPkt2.s_addr); 95 | 96 | p.ip_hl = (unsigned int)ipHeader->ip_hl; 97 | p.ip_v = (unsigned int)ipHeader->ip_v; 98 | p.ip_tos = (uint8_t)ipHeader->ip_tos; 99 | p.ip_len = ntohs(ipHeader->ip_len); 100 | p.ip_id = ntohs(ipHeader->ip_id); 101 | p.ip_off = ntohs(ipHeader->ip_off); 102 | p.ip_ttl = (uint8_t)ipHeader->ip_ttl; 103 | p.ip_p = (uint8_t)ipHeader->ip_p; 104 | p.ip_sum = ntohs(ipHeader->ip_sum); 105 | 106 | // TCP/UDP 107 | total++; 108 | switch (ipHeader->ip_p) 109 | { 110 | // ICMP Protocol 111 | case 1: 112 | icmp++; 113 | break; 114 | 115 | // TCP Protocol 116 | case 6: 117 | tcp++; 118 | 119 | struct tcphdr *tcpHeader = (struct tcphdr *)(packet + p.ip_hl * 4); 120 | p.srcport = ntohs(tcpHeader->th_sport); 121 | p.dstport = ntohs(tcpHeader->th_dport); 122 | // printf("%hu, %hu\n", p.srcport, p.dstport); 123 | break; 124 | 125 | // UDP Protocol 126 | case 17: 127 | udp++; 128 | struct udphdr *udpHeader = (struct udphdr *)(packet + p.ip_hl * 4); 129 | p.srcport = ntohs(udpHeader->uh_sport); 130 | p.dstport = ntohs(udpHeader->uh_dport); 131 | // printf("%hu, %hu\n", p.srcport, p.dstport); 132 | break; 133 | 134 | default: 135 | others++; 136 | break; 137 | } 138 | 139 | // printf("%u, %u, %hu, %hu, %u, %lu, %hu, %u, %u, %u, %hu, %u, %hu\n", p.srcip, p.dstip, p.srcport, p.dstport, p.ip_p, p.timestamp, p.ip_len, p.ip_v, p.ip_hl, p.ip_tos, p.ip_id, p.ip_ttl, p.ip_sum); 140 | 141 | trace_pkts = (Packet *)realloc(trace_pkts, (trace_count + 1) * sizeof(Packet)); 142 | trace_pkts[trace_count] = p; 143 | 144 | trace_count++; 145 | } 146 | 147 | void pcapParser(char *fileName) 148 | { 149 | pcap_t *descr; 150 | char errbuf[PCAP_ERRBUF_SIZE]; 151 | 152 | // open trace file for offline processing 153 | printf("Pre-process pcap file %s\n", fileName); 154 | trace_count = 0; 155 | 156 | descr = pcap_open_offline(fileName, errbuf); 157 | 158 | if (descr == NULL) 159 | { 160 | printf("[FILE ERROR] pcap_open_live() failed: \n"); 161 | } 162 | 163 | // start packet processing loop, just like live capture 164 | if (pcap_loop(descr, 0, packetHandler, NULL) < 0) 165 | { 166 | printf("pcap_loop() failed: %s\n", pcap_geterr(descr)); 167 | } 168 | 169 | printf("This pcap chunk reading is done... 
total %d packets \n", trace_count); 170 | } 171 | 172 | int pcap2csv(char *pcapFile, char *csvFile) 173 | { 174 | printf("pcap file: %s\n", pcapFile); 175 | printf("csv file: %s\n", csvFile); 176 | 177 | pcapParser(pcapFile); 178 | printf("TCP: %d, UDP: %d, ICMP: %d, Others: %d, total: %d\n", tcp, udp, icmp, others, total); 179 | 180 | FILE *fp; 181 | fp = fopen(csvFile, "w+"); 182 | 183 | fprintf(fp, "srcip,dstip,srcport,dstport,proto,time,pkt_len,version,ihl,tos,id,flag,off,ttl,chksum\n"); 184 | 185 | for (int i = 0; i < trace_count; i++) 186 | { 187 | Packet p = trace_pkts[i]; 188 | 189 | unsigned short int ip_flag = p.ip_off >> 13; 190 | unsigned short int ip_off = p.ip_off & IP_OFFMASK; 191 | 192 | char proto[128]; 193 | if (p.ip_p == 6) 194 | { 195 | strcpy(proto, "TCP"); 196 | } 197 | else if (p.ip_p == 17) 198 | { 199 | strcpy(proto, "UDP"); 200 | } 201 | else 202 | { 203 | printf("Not TCP/UDP packet!\n"); 204 | } 205 | 206 | fprintf(fp, "%u,%u,%hu,%hu,%s,%lu,%hu,%u,%u,%u,%hu,%hu,%hu,%u,%hu\n", p.srcip, p.dstip, p.srcport, p.dstport, proto, p.timestamp, p.ip_len, p.ip_v, p.ip_hl, p.ip_tos, p.ip_id, ip_flag, ip_off, p.ip_ttl, p.ip_sum); 207 | } 208 | 209 | fclose(fp); 210 | 211 | return 0; 212 | } -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/packet.h: -------------------------------------------------------------------------------- 1 | #ifndef PACKET_H 2 | #define PACKET_H 3 | 4 | // http://yuba.stanford.edu/~casado/pcap/section4.html 5 | 6 | #include 7 | 8 | // Self-defined packet structure 9 | typedef struct Packet 10 | { 11 | // Timestamp in microseconds 12 | unsigned long timestamp; /* timestamp */ 13 | 14 | // IP header 15 | unsigned int ip_hl; /* header length */ 16 | unsigned int ip_v; /* version */ 17 | uint8_t ip_tos; /* type of service */ 18 | u_short ip_len; /* total length */ 19 | u_short ip_id; /* identification */ 20 | u_short ip_off; /* fragment offset field */ 21 | uint8_t ip_ttl; /* time to live */ 22 | uint8_t ip_p; /* protocol */ 23 | u_short ip_sum; /* checksum */ 24 | uint32_t srcip; /* source IP */ 25 | uint32_t dstip; /* destination IP */ 26 | 27 | // TCP/UDP 28 | u_short srcport; /* source port */ 29 | u_short dstport; /* destination port */ 30 | }Packet; 31 | 32 | Packet* trace_pkts; 33 | int trace_count = 0; 34 | 35 | #endif -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/sharedlib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cc -fPIC -shared -o pcap2csv.so main.c -lm -lpcap -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import pickle 4 | import math 5 | import json 6 | import ast 7 | import socket 8 | import struct 9 | import ipaddress 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | from pathlib import Path 14 | from scapy.all import IP, ICMP, TCP, UDP 15 | from scapy.all import wrpcap 16 | from scipy.stats import rankdata 17 | from .embedding_helper import ( 18 | build_annoy_dictionary_word2vec, 19 | get_original_obj 20 | ) 21 | from .dist_metrics import ( 22 | compute_metrics_netflow_v3, 23 | compute_metrics_pcap_v3, 24 | compute_metrics_zeeklog_v3 25 | ) 26 | from ...model_managers.netshare_manager.netshare_util import 
get_configid_from_kv 27 | 28 | 29 | def convert_sdmetricsConfigQuant_to_fieldValueDict( 30 | sdmetricsConfigQuant 31 | ): 32 | '''Convert the sdmetricsConfigQuant to fieldValueDict 33 | Args: 34 | sdmetricsConfigQuant (dict): returned by create_sdmetrics_config(..., comparison_type='quantitative') 35 | Returns: 36 | fieldValueDict (dict): {field_name: value} 37 | ''' 38 | 39 | fieldValueDict = {} 40 | for metric_type, metrics in sdmetricsConfigQuant.items(): 41 | for metric_class_name, metric_class in metrics.items(): 42 | # metrics with target (e.g., attr dist similarity) 43 | if isinstance(metric_class, dict): 44 | for field_name, field_value in metric_class.items(): 45 | fieldValueDict[ast.literal_eval( 46 | field_name)[0]] = field_value[0][0] 47 | # metrics without target (e.g., session length) 48 | elif isinstance(metric_class, list): 49 | fieldValueDict[metric_class_name] = metric_class[0][0] 50 | 51 | return fieldValueDict 52 | 53 | 54 | def create_sdmetrics_config( 55 | config_pre_post_processor, 56 | comparison_type='both' 57 | ): 58 | # Refer to https://github.com/netsharecmu/SDMetrics_timeseries/blob/master/sdmetrics/reports/timeseries/sunglasses_qr.json to see the format of the config file 59 | sdmetrics_config = { 60 | "metadata": { 61 | "fields": {} 62 | }, 63 | "config": { 64 | "metrics": { 65 | "fidelity": [] 66 | } 67 | } 68 | } 69 | 70 | # Enumerate through all the fields in the metadata, timeseries, and timestamp 71 | for i, field in enumerate(config_pre_post_processor.metadata + 72 | config_pre_post_processor.timeseries): 73 | if field in config_pre_post_processor.metadata: 74 | metric_class_name = "Single attribute distributional similarity" 75 | class_name = "AttrDistSimilarity" 76 | elif field in config_pre_post_processor.timeseries: 77 | metric_class_name = "Single feature distributional similarity" 78 | class_name = "FeatureDistSimilarity" 79 | if 'bit' in getattr(field, 'encoding', '') or \ 80 | 'word2vec' in getattr(field, 'encoding', '') or \ 81 | 'categorical' in getattr(field, 'encoding', ''): 82 | sdmetrics_config["metadata"]["fields"][ 83 | field.column] = { 84 | "type": "categorical"} 85 | if getattr(field, 'type', '') == 'float': 86 | sdmetrics_config["metadata"]["fields"][ 87 | field.column] = { 88 | "type": "numerical"} 89 | sdmetrics_config["config"]["metrics"]["fidelity"].append( 90 | { 91 | metric_class_name: { 92 | "class": class_name, 93 | "target_list": [[field.column]], 94 | "configs": { 95 | "categorical_mapping": getattr(field, 'categorical_mapping', True), 96 | "comparison_type": comparison_type 97 | } 98 | } 99 | } 100 | ) 101 | 102 | # Add session length metric if the dataset is a pcap 103 | if config_pre_post_processor.dataset_type == 'pcap': 104 | sdmetrics_config["config"]["metrics"]["fidelity"].append( 105 | { 106 | "Session length distributional similarity": { 107 | "class": "SessionLengthDistSimilarity", 108 | "configs": { 109 | "comparison_type": comparison_type 110 | } 111 | } 112 | } 113 | ) 114 | if config_pre_post_processor.timestamp.generation: 115 | sdmetrics_config["metadata"]["fields"][ 116 | config_pre_post_processor.timestamp.column] = { 117 | "type": "numerical"} 118 | sdmetrics_config["config"]["metrics"]["fidelity"].append( 119 | { 120 | "Single feature distributional similarity": { 121 | "class": "FeatureDistSimilarity", 122 | "target_list": [ 123 | [ 124 | config_pre_post_processor.timestamp.column 125 | ] 126 | ], 127 | "configs": { 128 | "comparison_type": comparison_type 129 | } 130 | } 131 | } 132 | ) 133 | 
sdmetrics_config["metadata"]["entity_columns"] = [ 134 | field.column for field in config_pre_post_processor.metadata 135 | ] 136 | sdmetrics_config["metadata"]["sequence_index"] = config_pre_post_processor.timestamp.column if config_pre_post_processor.timestamp.generation else None 137 | sdmetrics_config["metadata"]["context_columns"] = [] 138 | 139 | return sdmetrics_config 140 | 141 | 142 | def _last_lvl_folder(folder): 143 | return str(Path(folder).parents[0]) 144 | 145 | 146 | def IP_int2str(IP_int): 147 | return str(ipaddress.ip_address(IP_int)) 148 | 149 | 150 | def IP_str2int(IP_str): 151 | return int(ipaddress.ip_address(IP_str)) 152 | 153 | 154 | def IPs_int2str(IPs_int): 155 | return [IP_int2str(i) for i in IPs_int] 156 | 157 | 158 | def IPs_str2int(IPs_str): 159 | return [IP_str2int(i) for i in IPs_str] 160 | 161 | 162 | pr_dict = { 163 | "ESP": 50, 164 | "GRE": 47, 165 | "ICMP": 1, 166 | "IPIP": 4, 167 | "IPv6": 41, 168 | "TCP": 6, 169 | "UDP": 17, 170 | "Other": 255 171 | } 172 | 173 | 174 | def prs_str2int(prs): 175 | prs_int = [] 176 | for p in prs: 177 | prs_int.append(pr_dict[p]) 178 | return prs_int 179 | 180 | 181 | pr_int2str_dict = { 182 | 1: "ICMP", 183 | 4: "IPIP", 184 | 6: "TCP", 185 | 17: "UDP", 186 | 41: "IPv6", 187 | 47: "GRE", 188 | 50: "ESP", 189 | 255: "Other" 190 | } 191 | 192 | 193 | def prs_int2str(prs_int): 194 | prs_str = [] 195 | for p in prs_int: 196 | prs_str.append(pr_int2str_dict[p]) 197 | return prs_str 198 | 199 | 200 | def csv2pcap_single(input, output): 201 | # df = pd.read_csv(input).sort_values(["time"]) 202 | df = input.sort_values(["time"]) 203 | 204 | packets = [] 205 | 206 | for i, row in tqdm(df.iterrows(), total=df.shape[0]): 207 | time = float(row["time"] / 10**6) 208 | if isinstance(row["srcip"], str): 209 | srcip = IP_str2int(row["srcip"]) 210 | dstip = IP_str2int(row["dstip"]) 211 | src = socket.inet_ntoa(struct.pack('!L', srcip)) 212 | dst = socket.inet_ntoa(struct.pack('!L', dstip)) 213 | else: 214 | src = socket.inet_ntoa(struct.pack('!L', row["srcip"])) 215 | dst = socket.inet_ntoa(struct.pack('!L', row["dstip"])) 216 | 217 | srcport = row["srcport"] 218 | dstport = row["dstport"] 219 | proto = row["proto"] 220 | pkt_len = int(row["pkt_len"]) 221 | 222 | try: 223 | proto = int(proto) 224 | except BaseException: 225 | if proto == "TCP": 226 | proto = 6 227 | elif proto == "UDP": 228 | proto = 17 229 | elif proto == "ICMP": 230 | proto = 1 231 | else: 232 | proto = 0 233 | 234 | ip = IP(src=src, dst=dst, len=pkt_len, proto=proto) 235 | if proto == 1: 236 | p = ip / ICMP() 237 | elif proto == 6: 238 | tcp = TCP(sport=srcport, dport=dstport) 239 | p = ip / tcp 240 | elif proto == 17: 241 | udp = UDP(sport=srcport, dport=dstport) 242 | p = ip / udp 243 | else: 244 | p = ip 245 | 246 | p.time = time 247 | p.len = pkt_len 248 | p.wirelen = pkt_len + 4 249 | 250 | packets.append(p) 251 | 252 | wrpcap(output, packets) 253 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/netshare/word2vec_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | from gensim.models import Word2Vec 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from .embedding_helper import build_annoy_dictionary_word2vec 9 | from .embedding_helper import get_original_obj, get_vector 10 | from sklearn.neighbors import NearestNeighbors 11 | 12 | 13 | def test_embed_bidirectional(model_path, ann, dic, word): 14 | model = 
Word2Vec.load(model_path) 15 | 16 | raw_vec = get_vector(model, word, False) 17 | normed_vec = get_vector(model, word, True) 18 | 19 | print("word: {}, vector(raw): {}".format(word, raw_vec)) 20 | print("word: {}, vector(l2-norm): {}".format(word, normed_vec)) 21 | 22 | print("vec(raw): {}, word: {}".format( 23 | raw_vec, get_original_obj(ann, raw_vec, dic))) 24 | print("vec(l2-norm): {}, word: {}".format(normed_vec, 25 | get_original_obj(ann, normed_vec, dic))) 26 | print() 27 | 28 | 29 | def test_model( 30 | df, 31 | model_path, 32 | word2vec_cols, 33 | word2vec_size, 34 | annoy_n_trees 35 | ): 36 | dict_type_annDictPair = build_annoy_dictionary_word2vec( 37 | df=df, 38 | model_path=model_path, 39 | word2vec_cols=word2vec_cols, 40 | word2vec_size=word2vec_size, 41 | n_trees=annoy_n_trees 42 | ) 43 | 44 | for col in word2vec_cols: 45 | type = col.encoding.split("_")[1] 46 | word = random.choice(df[col.column]) 47 | print("Testing {col.column}...") 48 | test_embed_bidirectional( 49 | model_path=model_path, 50 | ann=dict_type_annDictPair[type][0], 51 | dic=dict_type_annDictPair[type][1], 52 | word=word) 53 | 54 | 55 | def word2vec_train( 56 | df, 57 | out_dir, 58 | model_name, 59 | word2vec_cols, 60 | word2vec_size, 61 | annoy_n_trees, 62 | force_retrain=False, # retrain from scratch 63 | model_test=False 64 | ): 65 | model_path = os.path.join( 66 | out_dir, 67 | "{}_{}.model".format(model_name, word2vec_size)) 68 | 69 | if os.path.exists(model_path) and not force_retrain: 70 | print("Loading Word2Vec pre-trained model...") 71 | model = Word2Vec.load(model_path) 72 | else: 73 | print("Training Word2Vec model from scratch...") 74 | sentences = [] 75 | for row in range(0, len(df)): 76 | sentence = [str(df.at[row, col]) 77 | for col in [c.column for c in word2vec_cols]] 78 | sentences.append(sentence) 79 | 80 | model = Word2Vec( 81 | sentences=sentences, 82 | size=word2vec_size, 83 | window=5, 84 | min_count=1, 85 | workers=10) 86 | model.save(model_path) 87 | print(f"Word2Vec model is saved at {model_path}") 88 | 89 | if model_test: 90 | test_model( 91 | df=df, 92 | model_path=model_path, 93 | word2vec_cols=word2vec_cols, 94 | word2vec_size=word2vec_size, 95 | annoy_n_trees=annoy_n_trees 96 | ) 97 | 98 | return model_path 99 | -------------------------------------------------------------------------------- /netshare/pre_post_processors/pre_post_processor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | 4 | from netshare.utils import Tee 5 | 6 | 7 | class PrePostProcessor(ABC): 8 | def __init__(self, config): 9 | self._config = config 10 | 11 | @abstractmethod 12 | def _pre_process(self, input_folder, output_folder, log_folder): 13 | ... 14 | 15 | @abstractmethod 16 | def _post_process(self, input_folder, output_folder, 17 | pre_processed_data_folder, log_folder): 18 | ... 
19 | 20 | def pre_process(self, input_folder, output_folder, log_folder): 21 | stdout_log_path = os.path.join(log_folder, 'pre_process.stdout.log') 22 | stderr_log_path = os.path.join(log_folder, 'pre_process.stderr.log') 23 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 24 | return self._pre_process( 25 | input_folder=input_folder, 26 | output_folder=output_folder, 27 | log_folder=log_folder) 28 | 29 | def post_process(self, input_folder, output_folder, 30 | pre_processed_data_folder, log_folder): 31 | stdout_log_path = os.path.join(log_folder, 'post_process.stdout.log') 32 | stderr_log_path = os.path.join(log_folder, 'post_process.stderr.log') 33 | with Tee(stdout_path=stdout_log_path, stderr_path=stderr_log_path): 34 | return self._post_process( 35 | input_folder=input_folder, 36 | output_folder=output_folder, 37 | pre_processed_data_folder=pre_processed_data_folder, 38 | log_folder=log_folder) 39 | -------------------------------------------------------------------------------- /netshare/ray/__init__.py: -------------------------------------------------------------------------------- 1 | from .remote import remote, get 2 | from .config import config 3 | from .ray_functions import init, shutdown 4 | 5 | 6 | __all__ = ['config', 'init', 'shutdown', 'remote', 'get'] 7 | -------------------------------------------------------------------------------- /netshare/ray/config.py: -------------------------------------------------------------------------------- 1 | from addict import Dict 2 | 3 | config = Dict( 4 | enabled=True) 5 | config.freeze() 6 | -------------------------------------------------------------------------------- /netshare/ray/ray_functions.py: -------------------------------------------------------------------------------- 1 | from .config import config as ray_config 2 | 3 | def init(*args, **kwargs): 4 | if ray_config.enabled: 5 | print('Ray is enabled') 6 | import ray 7 | ray.init(*args, **kwargs) 8 | else: 9 | print('Ray is disabled') 10 | 11 | 12 | def shutdown(*args, **kargs): 13 | if ray_config.enabled: 14 | print('Ray is enabled') 15 | import ray 16 | ray.shutdown(*args, **kargs) 17 | else: 18 | print("Ray is disabled") 19 | -------------------------------------------------------------------------------- /netshare/ray/remote.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from .config import config as ray_config 3 | 4 | 5 | class ResultWrapper(object): 6 | def __init__(self, result): 7 | self._result = result 8 | 9 | def get_result(self): 10 | return self._result 11 | 12 | 13 | class RemoteFunctionWrapper(object): 14 | def __init__(self, *args, **kwargs): 15 | self._actual_remote_function = None 16 | self._ray_args = args 17 | self._ray_kwargs = kwargs 18 | 19 | def __call__(self, *args, **kwargs): 20 | raise TypeError('Remote functions cannot be called directly.') 21 | 22 | def remote(self, *args, **kwargs): 23 | if ray_config.enabled: 24 | if self._actual_remote_function is None: 25 | import ray 26 | if len(self._ray_kwargs) == 0: 27 | self._actual_remote_function = ray.remote( 28 | *self._ray_args, **self._ray_kwargs) 29 | else: 30 | self._actual_remote_function = ray.remote( 31 | **self._ray_kwargs)(*self._ray_args) 32 | return self._actual_remote_function.remote(*args, **kwargs) 33 | else: 34 | return ResultWrapper(self._ray_args[0](*args, **kwargs)) 35 | 36 | 37 | def remote(*args, **kwargs): 38 | if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): 39 | # This is the 
case where the decorator is just @ray.remote. 40 | # "args[0]" is the class or function under the decorator. 41 | return RemoteFunctionWrapper(args[0]) 42 | if not (len(args) == 0 and len(kwargs) > 0): 43 | raise ValueError('Error in the parameters of the decorator') 44 | return functools.partial(RemoteFunctionWrapper, **kwargs) 45 | 46 | 47 | def get(object_refs, **kwargs): 48 | if ray_config.enabled: 49 | import ray 50 | return ray.get(object_refs, **kwargs) 51 | else: 52 | if isinstance(object_refs, ResultWrapper): 53 | return object_refs.get_result() 54 | elif isinstance(object_refs, list): 55 | return [object_ref.get_result() for object_ref in object_refs] 56 | -------------------------------------------------------------------------------- /netshare/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .tee import Tee 2 | from .field import ContinuousField, DiscreteField, BitField, Word2VecField 3 | from .output import OutputType, Normalization, Output 4 | from .exec_cmd import exec_cmd 5 | 6 | __all__ = ['Tee', 'ContinuousField', 'DiscreteField', 'BitField', 7 | 'Word2VecField', 'OutputType', 'Normalization', 'Output', 'exec_cmd'] 8 | -------------------------------------------------------------------------------- /netshare/utils/exec_cmd.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | def exec_cmd(cmd, wait=False): 5 | p = subprocess.Popen(cmd, shell=True) 6 | if wait: 7 | p.wait() 8 | -------------------------------------------------------------------------------- /netshare/utils/field.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import pandas as pd 5 | from typing import Any, Dict, List 6 | from collections import defaultdict 7 | from annoy import AnnoyIndex 8 | 9 | from .output import Normalization, OutputType, Output 10 | from ..pre_post_processors.netshare.embedding_helper import get_vector, get_original_obj, get_original_objs 11 | 12 | EPS = 1e-8 13 | 14 | 15 | class Field(object): 16 | def __init__(self, name): 17 | self.name = name 18 | 19 | def normalize(self): 20 | raise NotImplementedError 21 | 22 | def denormalize(self): 23 | raise NotImplementedError 24 | 25 | def getOutputType(self): 26 | raise NotImplementedError 27 | 28 | 29 | class ContinuousField(Field): 30 | def __init__( 31 | self, norm_option, min_x=None, max_x=None, dim_x=1, 32 | log1p_norm=False, *args, **kwargs): 33 | super(ContinuousField, self).__init__(*args, **kwargs) 34 | 35 | self.min_x = min_x 36 | self.max_x = max_x 37 | self.norm_option = norm_option 38 | self.dim_x = dim_x 39 | self.log1p_norm = log1p_norm 40 | if self.log1p_norm: 41 | self.min_x = np.log1p(self.min_x) 42 | self.max_x = np.log1p(self.max_x) 43 | 44 | # Normalize x in [a, b]: x' = (b-a)(x-min x)/(max x - minx) + a 45 | def normalize(self, x): 46 | if x.shape[-1] != self.dim_x: 47 | raise ValueError(f"Dimension is {x.shape[-1]}. 
" 48 | f"Expected dimension is {self.dim_x}") 49 | if self.log1p_norm: 50 | x = np.log1p(x) 51 | 52 | # [0, 1] normalization 53 | if self.norm_option == Normalization.ZERO_ONE: 54 | return np.asarray((x - self.min_x) / (self.max_x - self.min_x)) 55 | 56 | # [-1, 1] normalization 57 | elif self.norm_option == Normalization.MINUSONE_ONE: 58 | return np.asarray(2 * (x - self.min_x) 59 | / (self.max_x - self.min_x) - 1) 60 | else: 61 | raise Exception("Not valid normalization option!") 62 | 63 | def denormalize(self, norm_x): 64 | if norm_x.shape[-1] != self.dim_x: 65 | raise ValueError(f"Dimension is {norm_x.shape[-1]}. " 66 | f"Expected dimension is {self.dim_x}") 67 | norm_x = norm_x.astype(np.float64) # Convert to float64 for precision 68 | 69 | # [0, 1] normalization 70 | if self.norm_option == Normalization.ZERO_ONE: 71 | to_return = norm_x * float(self.max_x - self.min_x) + self.min_x 72 | 73 | # [-1, 1] normalization 74 | elif self.norm_option == Normalization.MINUSONE_ONE: 75 | to_return = (norm_x + 1) / 2.0 * \ 76 | float(self.max_x - self.min_x) + self.min_x 77 | 78 | else: 79 | raise Exception("Not valid normalization option!") 80 | 81 | if self.log1p_norm: 82 | to_return = np.expm1(to_return) 83 | 84 | return to_return 85 | 86 | def getOutputType(self): 87 | return Output( 88 | type_=OutputType.CONTINUOUS, 89 | dim=self.dim_x, 90 | normalization=self.norm_option 91 | ) 92 | 93 | 94 | class DiscreteField(Field): 95 | def __init__(self, choices, *args, **kwargs): 96 | super(DiscreteField, self).__init__(*args, **kwargs) 97 | 98 | if not isinstance(choices, list): 99 | raise Exception("choices should be a list") 100 | self.choices = choices 101 | self.dim_x = len(choices) 102 | 103 | def normalize(self, x): 104 | if not isinstance(x, (list, np.ndarray)): 105 | norm_x = [x] 106 | else: 107 | norm_x = x 108 | norm_x = pd.DataFrame(norm_x).astype( 109 | pd.CategoricalDtype(categories=self.choices)) 110 | norm_x = pd.get_dummies(norm_x).to_numpy() 111 | if not isinstance(x, (list, np.ndarray)): 112 | norm_x = norm_x[0] 113 | 114 | return norm_x 115 | 116 | def denormalize(self, norm_x): 117 | index = np.argmax(norm_x, axis=-1) 118 | 119 | return np.asarray(self.choices)[index] 120 | 121 | def getOutputType(self): 122 | return Output( 123 | type_=OutputType.DISCRETE, 124 | dim=len(self.choices) 125 | ) 126 | 127 | 128 | class BitField(Field): 129 | def __init__(self, num_bits, *args, **kwargs): 130 | super(BitField, self).__init__(*args, **kwargs) 131 | 132 | self.num_bits = num_bits 133 | self.dim_x = 2*num_bits 134 | 135 | def normalize(self, decimal_x): 136 | bin_x = bin(int(decimal_x))[2:].zfill(self.num_bits) 137 | bin_x = [int(b) for b in bin_x] 138 | 139 | bits = [] 140 | for b in bin_x: 141 | if b == 0: 142 | bits += [1.0, 0.0] 143 | 144 | elif b == 1: 145 | bits += [0.0, 1.0] 146 | 147 | else: 148 | print("Binary number is zero or one!") 149 | 150 | return bits 151 | 152 | def denormalize(self, bin_x): 153 | if len(bin_x.shape) == 3: 154 | # This is a timeseries field 155 | a, b, c = bin_x.shape 156 | if self.num_bits * 2 != c: 157 | raise ValueError( 158 | f"Dimension is {c}. 
Expected dimension is {self.num_bits * 2}" 159 | ) 160 | return self.denormalize( 161 | bin_x.reshape(a * b, c)).to_numpy().reshape( 162 | a, b) 163 | df_bin = pd.DataFrame(bin_x) 164 | chosen_bits = (df_bin > df_bin.shift(axis=1)).drop( 165 | range(0, self.num_bits * 2, 2), axis=1 166 | ) 167 | return chosen_bits.dot(1 << np.arange(self.num_bits - 1, -1, -1)) 168 | 169 | def getOutputType(self): 170 | outputs = [] 171 | 172 | for i in range(self.num_bits): 173 | outputs.append(Output(type_=OutputType.DISCRETE, dim=2)) 174 | 175 | return outputs 176 | 177 | 178 | class Word2VecField(Field): 179 | def __init__( 180 | self, word2vec_size, pre_processed_data_folder, word2vec_type, * 181 | args, **kwargs): 182 | super(Word2VecField, self).__init__(*args, **kwargs) 183 | 184 | self.word2vec_size = word2vec_size 185 | self.preprocessed_data_folder = pre_processed_data_folder 186 | self.word2vec_type = word2vec_type 187 | self.dim_x = word2vec_size 188 | self.norm_option = Normalization.MINUSONE_ONE 189 | 190 | def normalize(self, x, embed_model): 191 | return np.array( 192 | [get_vector(embed_model, str(xi), norm_option=True) for xi in x] 193 | ) 194 | 195 | def denormalize(self, norm_x): 196 | # load Annoy and Dict 197 | type_ann = AnnoyIndex(self.word2vec_size, 'angular') 198 | type_ann.load(os.path.join( 199 | self.preprocessed_data_folder, 200 | f"{self.word2vec_type}_ann.ann")) 201 | with open(os.path.join(self.preprocessed_data_folder, f"{self.word2vec_type}_dict.json"), 'r') as f: 202 | type_dict = json.load(f) 203 | 204 | if len(norm_x.shape) == 3: 205 | # This is a timeseries field 206 | return np.array( 207 | [ 208 | get_original_objs( 209 | ann=type_ann, 210 | vectors=x, 211 | dic={int(k): v for k, v in type_dict.items()} 212 | ) 213 | for x in norm_x 214 | ] 215 | ) 216 | return np.asarray(get_original_objs( 217 | ann=type_ann, 218 | vectors=norm_x, 219 | dic={int(k): v for k, v in type_dict.items()} 220 | )) 221 | 222 | def getOutputType(self): 223 | return Output( 224 | type_=OutputType.CONTINUOUS, 225 | dim=self.dim_x, 226 | normalization=self.norm_option 227 | ) 228 | -------------------------------------------------------------------------------- /netshare/utils/logger.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import io 3 | import logging 4 | import sys 5 | 6 | logger: logging.Logger = logging.getLogger("netshare") 7 | 8 | handler = logging.StreamHandler(sys.stdout) 9 | formatter = logging.Formatter( 10 | "%(asctime)s - %(levelname)s - %(funcName)s - %(message)s" 11 | ) 12 | handler.setFormatter(formatter) 13 | logger.addHandler(handler) 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | class TqdmToLogger(io.StringIO): 18 | """ 19 | Util to output tqdm progress bar to the logger. 
20 | """ 21 | 22 | def __init__(self, description: str) -> None: 23 | super().__init__() 24 | self.description = description 25 | 26 | def write(self, buf: str) -> int: 27 | if buf.strip(): 28 | logger.debug(f"{self.description}: {buf.strip()}") 29 | return len(buf) 30 | -------------------------------------------------------------------------------- /netshare/utils/output.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class OutputType(Enum): 5 | CONTINUOUS = "CONTINUOUS" 6 | DISCRETE = "DISCRETE" 7 | 8 | 9 | class Normalization(Enum): 10 | ZERO_ONE = "ZERO_ONE" 11 | MINUSONE_ONE = "MINUSONE_ONE" 12 | 13 | 14 | class Output(object): 15 | def __init__(self, type_, dim, normalization=None, is_gen_flag=False): 16 | self.type_ = type_ 17 | self.dim = dim 18 | self.normalization = normalization 19 | self.is_gen_flag = is_gen_flag 20 | 21 | if type_ == OutputType.CONTINUOUS and normalization is None: 22 | raise Exception("normalization must be set for continuous output") 23 | -------------------------------------------------------------------------------- /netshare/utils/tee.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | 4 | 5 | class DuplicateWriter(object): 6 | def __init__(self, file_objects): 7 | self._file_objects = file_objects 8 | 9 | def write(self, data): 10 | for file_object in self._file_objects: 11 | file_object.write(data) 12 | file_object.flush() 13 | 14 | def writelines(self, data): 15 | for file_object in self._file_objects: 16 | file_object.write(data) 17 | file_object.flush() 18 | 19 | def flush(self): 20 | for file_object in self._file_objects: 21 | file_object.flush() 22 | 23 | def close(self): 24 | for file_object in self._file_objects: 25 | file_object.close() 26 | 27 | 28 | class Tee(object): 29 | def __init__(self, stdout_path, stderr_path): 30 | self.stdout_file = open(stdout_path, 'w') 31 | self.stderr_file = open(stderr_path, 'w') 32 | self.stdout = sys.stdout 33 | self.stderr = sys.stderr 34 | self.stdout_writer = DuplicateWriter([sys.stdout, self.stdout_file]) 35 | self.stderr_writer = DuplicateWriter([sys.stderr, self.stderr_file]) 36 | 37 | def __enter__(self): 38 | sys.stdout = self.stdout_writer 39 | sys.stderr = self.stderr_writer 40 | 41 | def __exit__(self, exc_type, exc, exc_tb): 42 | sys.stdout = self.stdout 43 | sys.stderr = self.stderr 44 | if exc_type is not None: 45 | self.stderr_writer.write(traceback.format_exc()) 46 | self.stderr_writer.flush() 47 | self.stdout_writer.flush() 48 | self.stderr_file.close() 49 | self.stdout_file.close() 50 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | VERSION = "0.0.1" 5 | DESCRIPTION = "NetShare" 6 | this_directory = Path(__file__).parent 7 | LONG_DESCRIPTION = (this_directory / "README.md").read_text() 8 | 9 | # Setting up 10 | setup( 11 | # the name must match the folder name 'verysimplemodule' 12 | name="netshare", 13 | version=VERSION, 14 | author="Yucheng Yin, Zinan Lin, Minhao Jin, Giulia Fanti, Vyas Sekar", 15 | author_email="yyin4@andrew.cmu.edu", 16 | description=DESCRIPTION, 17 | long_description=LONG_DESCRIPTION, 18 | long_description_content_type="text/markdown", 19 | packages=find_packages(), 20 | install_requires=[ 21 | "torch", 22 | 
"tensorboard", 23 | "opacus", 24 | "tqdm", 25 | "matplotlib", 26 | "pandas", 27 | "scikit-learn", 28 | "more-itertools", 29 | "gensim==3.8.3", 30 | "networkx", 31 | "notebook", 32 | "ipyplot", 33 | "jupyterlab", 34 | "statsmodels", 35 | "gdown", 36 | "annoy==1.17.1", 37 | "pyshark", 38 | "scapy", 39 | "ray", 40 | "ray[default]", 41 | "multiprocess", 42 | "addict", 43 | "config_io==0.4.0", 44 | "flask", 45 | ], # add any additional packages that 46 | # needs to be installed along with your package. Eg: 'caer' 47 | keywords=["python", "netshare"], 48 | classifiers=[ 49 | "Development Status :: 3 - Alpha", 50 | "Intended Audience :: Education", 51 | "Programming Language :: Python :: 3.9", 52 | "Operating System :: MacOS :: MacOS X", 53 | "Operating System :: POSIX :: Linux" 54 | ], 55 | ) 56 | -------------------------------------------------------------------------------- /traces/README.md: -------------------------------------------------------------------------------- 1 | Please download the all the example datasets [here](https://drive.google.com/drive/folders/1FOl1VMr0tXhzKEOupxnJE9YQ2GwfX2FD?usp=sharing). 2 | 3 | --- 4 | 5 | # Dataset description 6 | 7 | Three NetFlow datasets: Netflow data has the following schema TBD 8 | 9 | 1. [UGR16](https://nesg.ugr.es/nesg-ugr16/) dataset consists of traffic (including attacks) from NetFlow v9 collectors in a Spanish ISP network. We used data from the third week of March 2016. 10 | 2. [CIDDS](https://www.hs-coburg.de/forschung/forschungsprojekte-oeffentlich/informationstechnologie/cidds-coburg-intrusion-detection-data-sets.html) dataset emulates a small business environment with several clients and servers (e.g., email, web) with injected malicious traffic was executed. Each NetFlow entry recorded with the label (benign/attack) and attack type (DoS, brute force, port scan). 11 | 3. [TON](https://research.unsw.edu.au/projects/toniot-datasets) dataset represents telemetry IoT sensors. We use a sub-dataset (“Train_Test_datasets”) for evaluating cybersecurity-related ML algorithms; of its 461,013 records, 300,000 (65.07%) are normal, and the rest (34.93%) combine nine evenly-distributed attack types (e.g., backdoor, DDoS, injection, MITM). 12 | 13 | Three PCAP datasets: 14 | 15 | 1. [CAIDA](https://www.caida.org/catalog/datasets/passive_dataset/) contains anonymized traces from high-speed monitors on a commercial backbone link. Our subset is from the New York collector in March 2018. (**Require an CAIDA account to download the data**) 16 | 2. [DC](https://pages.cs.wisc.edu/~tbenson/IMC10_Data.html) dataset is a packet capture from the "UNI1" data center studied in the [IMC 2010 paper](https://pages.cs.wisc.edu/~tbenson/papers/imc192.pdf). 17 | 3. [CA](https://www.netresec.com/?page=MACCDC) dataset is traces from The U.S. National CyberWatch Mid-Atlantic Collegiate Cyber Defense Competitions from March 2012. 
18 | 19 | Zeek: Zeek logs have the following schema: TBD 20 | 21 | Wikipedia: The Wikipedia web page view logs have the following schema: TBD 22 | -------------------------------------------------------------------------------- /traces/caida-small/raw.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netsharecmu/NetShare/af026037a88db486069209e2258e11c2df1b93e2/traces/caida-small/raw.pcap -------------------------------------------------------------------------------- /util/README.md: -------------------------------------------------------------------------------- 1 | # Cluster prerequisites 2 | 1. Nodes are mounted with NFS. 3 | 2. Nodes can communicate with each other over SSH as a normal (non-root) user. 4 | 5 | # Ray cluster setup 6 | 7 | Launch the Ray cluster (run this command on your own laptop or on a host inside the cluster). 8 | 9 | Whether you launch the Ray cluster from your own laptop or from a host inside the cluster, make sure the conda environment "NetShare" is activated. 10 | 11 | 12 | ```bash 13 | # Change the head and worker IPs in example.yaml 14 | (NetShare) node1:/nfs/NetShare-dev$ export LC_ALL=C.UTF-8 15 | (NetShare) node1:/nfs/NetShare-dev$ ray up ray/example.yaml 16 | ``` 17 | 18 | If launching the cluster from within the cluster fails with an error like `FileNotFoundError: [Errno 2] No such file or directory: '/tmp/ray/cluster-test.lock'`, it is most likely a Ray bug. 19 | 20 | The workaround is: 21 | 22 | ```bash 23 | # Change the head and worker IPs in example.yaml 24 | (NetShare) node1:/nfs/NetShare-dev$ export LC_ALL=C.UTF-8 25 | (NetShare) node1:/nfs/NetShare-dev$ ray start --head 26 | (NetShare) node1:/nfs/NetShare-dev$ ray stop 27 | (NetShare) node1:/nfs/NetShare-dev$ ray up ray/example.yaml 28 | ``` 29 | 30 | Check that the Ray cluster has launched successfully:
31 | ``` bash 32 | (NetShare) node1:/nfs/NetShare-dev$ ray status 33 | 34 | ======== Autoscaler status: 2022-07-23 10:08:03.979944 ======== 35 | Node status 36 | --------------------------------------------------------------- 37 | Healthy: 38 | 4 local.cluster.node 39 | Pending: 40 | (no pending nodes) 41 | Recent failures: 42 | (no failures) 43 | 44 | Resources 45 | --------------------------------------------------------------- 46 | Usage: 47 | 0.0/160.0 CPU 48 | 0.00/513.323 GiB memory 49 | 0.14/223.987 GiB object_store_memory 50 | 51 | Demands: 52 | (no resource demands) 53 | ``` 54 | 55 | Or 56 | 57 | ``` bash 58 | (NetShare) node1:/nfs/NetShare-dev$ python3 ray/check_nodes.py 59 | 60 | Python version 61 | 3.6.13 |Anaconda, Inc.| (default, Jun 4 2021, 14:25:59) 62 | [GCC 7.5.0] 63 | {'128.105.144.191', '128.105.144.190', '128.105.144.179', '128.105.144.199'} 64 | [{'NodeID': '58433beab7f1653cde1324b9a6764596fb0ef534eaf2182946ef28a4', 'Alive': True, 'NodeManagerAddress': '128.105.144.199', 'NodeManagerHostname': 'node4.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 42365, 'ObjectManagerPort': 34065, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 50427, 'NodeName': '128.105.144.199', 'alive': True, 'Resources': {'object_store_memory': 60156551577.0, 'node:128.105.144.199': 1.0, 'memory': 140365287015.0, 'CPU': 40.0}}, {'NodeID': '315f7a09c9e7633d7e6119730004188116696c069a463472671018c5', 'Alive': True, 'NodeManagerAddress': '128.105.144.191', 'NodeManagerHostname': 'node3.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 36329, 'ObjectManagerPort': 41259, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 58422, 'NodeName': '128.105.144.191', 'alive': True, 'Resources': {'CPU': 40.0, 'memory': 140363113677.0, 'object_store_memory': 60155620147.0, 'node:128.105.144.191': 1.0}}, {'NodeID': '30a870e576b48152b1150ca7d026ad9d51a16377121ad494355e7f76', 'Alive': True, 'NodeManagerAddress': '128.105.144.190', 'NodeManagerHostname': 'node2.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 35237, 'ObjectManagerPort': 33677, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 56875, 'NodeName': '128.105.144.190', 'alive': True, 'Resources': {'object_store_memory': 60154269696.0, 'node:128.105.144.190': 1.0, 'memory': 140359962624.0, 'CPU': 40.0}}, {'NodeID': '3a36f6e72af22d38d74f353ef6daf44a02f25668875b528c462d2f17', 'Alive': True, 'NodeManagerAddress': '128.105.144.179', 'NodeManagerHostname': 'node1.env-test.cloudmigration-pg0.wisc.cloudlab.us', 'NodeManagerPort': 44069, 'ObjectManagerPort': 44719, 'ObjectStoreSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2022-07-22_12-34-30_640417_1124/sockets/raylet', 'MetricsExportPort': 65331, 'NodeName': '128.105.144.179', 'alive': True, 'Resources': {'object_store_memory': 60037563187.0, 'node:128.105.144.179': 1.0, 'CPU': 40.0, 'memory': 130087647437.0}}] 65 | ``` 66 | 67 | Check if dashboard has been launched successfully 68 | 69 | dashboard: http://:8265/ 
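Once the cluster is up, the Ray dashboard is served from the head node on port 8265 (as configured in `ray/example.yaml`), i.e. `http://<head_node_ip>:8265/`. For a quick programmatic sanity check, the sketch below attaches to the running cluster from any node and reports what it sees; this is only an illustration (it assumes the "NetShare" conda environment is active), and `util/ray/check_nodes.py` above does a more thorough job:

```python
# Minimal sanity check: attach to the cluster started by `ray up` and
# report how many nodes and CPUs it exposes.
import ray

ray.init(address="auto")  # connect to the existing cluster, do not start a new one

alive_nodes = [node for node in ray.nodes() if node["Alive"]]
print(f"Alive nodes: {len(alive_nodes)}")
print(f"Total CPUs: {ray.cluster_resources().get('CPU', 0)}")
```

To tear the cluster down when you are done, run `ray down ray/example.yaml` from the same place you ran `ray up` (or run `ray stop` on each node).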
-------------------------------------------------------------------------------- /util/grow-rootfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Ref: https://groups.google.com/g/powder-users/c/QQiQ4uWsFmI/m/4rPTmHt0BAAJ 4 | # HOW TO RUN the script 5 | # env RESIZEROOT=64 ./grow-rootfs.sh 6 | 7 | # 8 | # If user wants to resize the rootfs to the max, try to do that. 9 | # 10 | set -x 11 | 12 | 13 | if [ `id -u` -ne 0 ] ; then 14 | echo "This script must be run as root" 1>&2 15 | exit 1 16 | fi 17 | 18 | if [ -z "$RESIZEROOT" ]; then 19 | echo "ERROR: must define RESIZEROOT to the new total size (GB) you want for the rootfs!" 20 | exit 0 21 | fi 22 | if [ -z "$IMPOTENT" ]; then 23 | IMPOTENT=0 24 | fi 25 | if [ -z "$NODELETE" ]; then 26 | NODELETE=0 27 | fi 28 | 29 | # Remove any existing temp files. 30 | rm -fv /tmp/sfdisk.orig /tmp/sfdisk.new /tmp/sfdisk.new \ 31 | /tmp/sfdisk.nextstart /tmp/sfdisk.parts-to-delete 32 | 33 | # Find the root partition's parent disk. 34 | eval `lsblk -n -P -b -o NAME,FSTYPE,MOUNTPOINT,PARTTYPE,PARTUUID,TYPE,PKNAME,SIZE | grep 'MOUNTPOINT="/"'` 35 | ROOTPARENT=$PKNAME 36 | ROOT=$NAME 37 | if [ -z "$ROOTPARENT" -o -z "$ROOT" ]; then 38 | echo "ERROR: unable to find root device or root parent disk; aborting!" 39 | exit 1 40 | fi 41 | # Find root partition number. 42 | ROOTPARTNO=`echo "$ROOT" | sed -ne "s/^${ROOTPARENT}p\([0-9]*\)$/\1/p"` 43 | if [ ! $? -eq 0 -o -z "$ROOTPARTNO" ]; then 44 | ROOTPARTNO=`echo "$NAME" | sed -ne "s/^${ROOTPARENT}\([0-9]*\)$/\1/p"` 45 | fi 46 | if [ -z "$ROOTPARTNO" ]; then 47 | echo "ERROR: could not determine the root partition number; aborting!" 48 | exit 1 49 | fi 50 | 51 | # Save off the original partition table, and create a new one to modify. 52 | sfdisk -d /dev/$ROOTPARENT > /tmp/sfdisk.orig 53 | cp -p /tmp/sfdisk.orig /tmp/sfdisk.new 54 | 55 | if [ $NODELETE -eq 0 ]; then 56 | # Swapoff all swap devices if we are not impotent; they will be 57 | # removed. 58 | for dev in `blkid -t TYPE=swap | cut -d: -f1 | xargs` ; do 59 | if [ ! $IMPOTENT -eq 1 ]; then 60 | swapoff $dev 61 | if [ ! $? -eq 0 ]; then 62 | echo "ERROR: failed to swapoff $dev; aborting!" 63 | exit 1 64 | fi 65 | fi 66 | done 67 | 68 | # Figure out which partitions to remove. We remove any partition on 69 | # the rootparent with FSTYPE="" and MOUNTPOINT="" and 70 | # PARTUUID=(0fc63daf-8483-4772-8e79-3d69d8477de4|00000000-0000-0000-0000-000000000000|0657FD6D-A4AB-43C4-84E5-0933C84B4F4F|0x83|0x82|0x0). 71 | 72 | PARTS="" 73 | lsblk -a -n -P -b -o NAME,FSTYPE,MOUNTPOINT,PARTTYPE,PARTUUID,TYPE,PKNAME,SIZE | grep "PKNAME=\"${ROOTPARENT}\"" | while read line ; do 74 | eval "$line" 75 | if [ "$FSTYPE" != swap -a \( -n "$FSTYPE" -o -n "$MOUNTPOINT" \) ]; then 76 | continue 77 | fi 78 | echo "$PARTTYPE" | grep -qEi '^(0fc63daf-8483-4772-8e79-3d69d8477de4|00000000-0000-0000-0000-000000000000|0657FD6D-A4AB-43C4-84E5-0933C84B4F4F|0x83|0x82|0x0)$' 79 | if [ ! $? -eq 0 ]; then 80 | continue 81 | fi 82 | # Now extract the partition number (to feed to parted). Partition 83 | # number is not reported by most Linux tools nor by sysfs, so we 84 | # have to extract via regexp. Right now we only worry about nvme 85 | # devices (or any device that ends with a "p\d+"), and assume that 86 | # anything else is "standard". 87 | PARTNO=`echo "$NAME" | sed -ne "s/^${PKNAME}p\([0-9]*\)$/\1/p"` 88 | if [ ! $? -eq 0 -o -z "$PARTNO" ]; then 89 | PARTNO=`echo "$NAME" | sed -ne "s/^${PKNAME}\([0-9]*\)$/\1/p"` 90 | fi 91 | if [ ! $? 
-eq 0 -o -z "$PARTNO" ]; then 92 | continue 93 | fi 94 | PARTS="$PARTNO $PARTS" 95 | echo $PARTNO >> /tmp/sfdisk.parts-to-delete 96 | done 97 | 98 | if [ -e /tmp/sfdisk.parts-to-delete ]; then 99 | PARTS=`cat /tmp/sfdisk.parts-to-delete | xargs` 100 | rm -f /tmp/sfdisk.tmp 101 | cat /tmp/sfdisk.new | while read line ; do 102 | delete=0 103 | for part in $PARTS ; do 104 | echo "$line" | grep -q "^/dev/${ROOTPARENT}$part :" 105 | if [ $? -eq 0 ]; then 106 | delete=1 107 | break 108 | fi 109 | done 110 | if [ $delete -eq 0 ]; then 111 | echo "$line" >> /tmp/sfdisk.tmp 112 | fi 113 | done 114 | diff -u /tmp/sfdisk.new /tmp/sfdisk.tmp 115 | mv /tmp/sfdisk.tmp /tmp/sfdisk.new 116 | fi 117 | fi 118 | 119 | # 120 | # Now we need to figure out the max sector we can end on. If there is a 121 | # partition further up the disk, we can't stomp it. 122 | # 123 | DISKSIZE=`sfdisk -l /dev/$ROOTPARENT | sed -ne 's/^Disk.*, \([0-9]*\) sectors$/\1/p'` 124 | ROOTSTART=`sfdisk -l -o device,start,end /dev/$ROOTPARENT | sed -ne "s|/dev/${ROOT} *\([0-9]*\) *\([0-9]*\)$|\1|p"` 125 | ROOTEND=`sfdisk -l -o device,start,end /dev/$ROOTPARENT | sed -ne "s|/dev/${ROOT} *\([0-9]*\) *\([0-9]*\)$|\2|p"` 126 | ROOTSIZE=`expr $ROOTEND - $ROOTSTART + 1` 127 | # First, we find the max size of the new root partition in sectors. If 128 | # we find a partition with a start greater than ROOTEND, that value - 129 | # 2048 is the new end. Otherwise, it is DISKSIZE - 2048. 130 | nextstart=$DISKSIZE 131 | cat /tmp/sfdisk.new | grep "^/dev" | while read line ; do 132 | nstart=`echo $line | sed -ne "s|/dev/[^ ]* *: *start= *\([0-9]*\),.*$|\1|p"` 133 | if [ -z "$nstart" ] ; then 134 | continue 135 | fi 136 | if [ $nstart -gt $ROOTSTART -a $nstart -lt $nextstart ]; then 137 | nextstart=$nstart 138 | echo $nextstart > /tmp/sfdisk.nextstart 139 | fi 140 | done 141 | if [ -e /tmp/sfdisk.nextstart -a -s /tmp/sfdisk.nextstart ]; then 142 | nextstart=`cat /tmp/sfdisk.nextstart` 143 | fi 144 | align=0 145 | if [ ! `expr $nextstart \% 2048` -eq 0 ]; then 146 | align=2048 147 | fi 148 | maxsize=`expr $nextstart - $align - $ROOTSTART` 149 | # Sanitize the size. We only support GB. 150 | RESIZEROOT=`echo "$RESIZEROOT" | sed -ne 's/^\([0-9]*\)[^0-9]*$/\1/p'` 151 | if [ -z "$RESIZEROOT" ]; then 152 | echo "ERROR: could not determine size of root disk $ROOTPARENT; aborting!" 153 | exit 1 154 | fi 155 | if [ $RESIZEROOT -eq 0 ]; then 156 | newsize=$maxsize 157 | else 158 | usersectors=`expr $RESIZEROOT \* 1024 \* 1024 \* 1024 / 512` 159 | if [ $usersectors -gt $maxsize ]; then 160 | newsize=$maxsize 161 | else 162 | newsize=$usersectors 163 | fi 164 | fi 165 | if [ -z "$newsize" ]; then 166 | echo "ERROR: failed to calculate new root partition size; aborting!" 167 | exit 1 168 | fi 169 | 170 | 171 | 172 | if [ $newsize -eq $ROOTSIZE ]; then 173 | echo "Nothing to do: newsize ($newsize) same as current root size ($ROOTSIZE)." 174 | exit 0 175 | fi 176 | 177 | if [ $newsize -lt $ROOTSIZE ]; then 178 | echo "ERROR: newsize ($newsize) less than current root size ($ROOTSIZE); aborting!" 179 | exit 1 180 | fi 181 | 182 | 183 | if [ $newsize -lt 2048 ]; then 184 | echo "WARNING: cannot expand root partition; skipping!" 185 | exit 0 186 | fi 187 | 188 | # Finally, edit the sfdisk.new file to change the root device's size. 189 | cat /tmp/sfdisk.new | while read line ; do 190 | echo "$line" | grep -q "^/dev/${ROOT} :" 191 | if [ $? 
-eq 0 ]; then 192 | echo "$line" | sed -e "s|^\(/dev/${ROOT} :.*\)\(size= *[0-9]*,\)\(.*\)$|\1size=${newsize}\3|" >> /tmp/sfdisk.tmp 193 | else 194 | echo "$line" >> /tmp/sfdisk.tmp 195 | fi 196 | done 197 | mv /tmp/sfdisk.tmp /tmp/sfdisk.new 198 | 199 | diff -u /tmp/sfdisk.orig /tmp/sfdisk.new 200 | 201 | if [ $IMPOTENT -eq 1 ]; then 202 | exit 0 203 | fi 204 | 205 | sfdisk --force /dev/$ROOTPARENT < /tmp/sfdisk.new 206 | partprobe /dev/$ROOTPARENT 207 | resize2fs /dev/$ROOT 208 | if [ ! $? -eq 0 ]; then 209 | echo "ERROR: failed to resize /dev/$ROOT filesystem; aborting!" 210 | exit 1 211 | fi 212 | 213 | echo "Resized /dev/$ROOT." 214 | 215 | exit 0 216 | -------------------------------------------------------------------------------- /util/ray/check_nodes.py: -------------------------------------------------------------------------------- 1 | import ray 2 | import sys 3 | 4 | ray.init(address="auto") 5 | # ray.init(address='ray://128.105.144.254:10001') 6 | 7 | import time 8 | 9 | @ray.remote 10 | def f(): 11 | time.sleep(0.01) 12 | ip = ray._private.services.get_node_ip_address() 13 | # f = open(f"/nfs/ray-test/node{ip}.txt", "a") 14 | # f.write("1") 15 | # f.close() 16 | return ip 17 | 18 | # Get a list of the IP addresses of the nodes that have joined the cluster. 19 | print("Python version") 20 | print(sys.version) 21 | print(set(ray.get([f.remote() for _ in range(100000)]))) 22 | print(ray.nodes()) 23 | -------------------------------------------------------------------------------- /util/ray/example.yaml: -------------------------------------------------------------------------------- 1 | # A unique identifier for the head node and workers of this cluster. 2 | cluster_name: test 3 | 4 | provider: 5 | type: local 6 | head_ip: nfs.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us 7 | # You may need to supply a public ip for the head node if you need 8 | # to run `ray up` from outside of the Ray cluster's network 9 | # (e.g. the cluster is in an AWS VPC and you're starting ray from your laptop) 10 | # This is useful when debugging the local node provider with cloud VMs. 11 | # external_head_ip: YOUR_HEAD_PUBLIC_IP 12 | worker_ips: [node1.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node2.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node3.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node4.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node5.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node6.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node7.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node8.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node9.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node10.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us,node11.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node12.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node13.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node14.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node15.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node16.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node17.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node18.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node19.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us, node20.netshare-wisc.cloudmigration-pg0.wisc.cloudlab.us] 13 | # worker_ips: [10.10.1.2, 10.10.1.3, 10.10.1.4, 10.10.1.5, 10.10.1.6, 10.10.1.7, 10.10.1.8, 10.10.1.9, 10.10.1.10, 10.10.1.11] 14 | # worker_ips: [] 15 | # Optional when running automatic cluster management on prem. 
If you use a coordinator server, 16 | # then you can launch multiple autoscaling clusters on the same set of machines, and the coordinator 17 | # will assign individual nodes to clusters as needed. 18 | # coordinator_address: ":" 19 | 20 | # How Ray will authenticate with newly launched nodes. 21 | auth: 22 | ssh_user: yyucheng 23 | # You can comment out `ssh_private_key` if the following machines don't need a private key for SSH access to the Ray 24 | # cluster: 25 | # (1) The machine on which `ray up` is executed. 26 | # (2) The head node of the Ray cluster. 27 | # 28 | # The machine that runs ray up executes SSH commands to set up the Ray head node. The Ray head node subsequently 29 | # executes SSH commands to set up the Ray worker nodes. When you run ray up, ssh credentials sitting on the ray up 30 | # machine are copied to the head node -- internally, the ssh key is added to the list of file mounts to rsync to head node. 31 | ssh_private_key: ~/.ssh/id_rsa 32 | 33 | # The minimum number of workers nodes to launch in addition to the head 34 | # node. This number should be >= 0. 35 | # Typically, min_workers == max_workers == len(worker_ips). 36 | # This field is optional. 37 | # min_workers: 1 38 | 39 | # The maximum number of workers nodes to launch in addition to the head node. 40 | # This takes precedence over min_workers. 41 | # Typically, min_workers == max_workers == len(worker_ips). 42 | # This field is optional. 43 | # max_workers: 1 44 | # The default behavior for manually managed clusters is 45 | # min_workers == max_workers == len(worker_ips), 46 | # meaning that Ray is started on all available nodes of the cluster. 47 | # For automatically managed clusters, max_workers is required and min_workers defaults to 0. 48 | 49 | # The autoscaler will scale up the cluster faster with higher upscaling speed. 50 | # E.g., if the task requires adding more nodes then autoscaler will gradually 51 | # scale up the cluster in chunks of upscaling_speed*currently_running_nodes. 52 | # This number should be > 0. 53 | # upscaling_speed: 1.0 54 | 55 | # idle_timeout_minutes: 5 56 | 57 | # Files or directories to copy to the head and worker nodes. The format is a 58 | # dictionary from REMOTE_PATH: LOCAL_PATH. E.g. you could save your conda env to an environment.yaml file, mount 59 | # that directory to all nodes and call `conda -n my_env -f /path1/on/remote/machine/environment.yaml`. In this 60 | # example paths on all nodes must be the same (so that conda can be called always with the same argument) 61 | file_mounts: { 62 | # "/path1/on/remote/machine": "/path1/on/local/machine", 63 | # "/path2/on/remote/machine": "/path2/on/local/machine", 64 | # "~": "../NetShare-dev", 65 | } 66 | 67 | # Files or directories to copy from the head node to the worker nodes. The format is a 68 | # list of paths. The same path on the head node will be copied to the worker node. 69 | # This behavior is a subset of the file_mounts behavior. In the vast majority of cases 70 | # you should just use file_mounts. Only use this if you know what you're doing! 
71 | # cluster_synced_files: ["~/test.txt"] 72 | 73 | # Whether changes to directories in file_mounts or cluster_synced_files in the head node 74 | # should sync to the worker node continuously 75 | # file_mounts_sync_continuously: False 76 | 77 | # Patterns for files to exclude when running rsync up or rsync down 78 | # rsync_exclude: 79 | # - "**/.git" 80 | # - "**/.git/**" 81 | 82 | # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for 83 | # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided 84 | # as a value, the behavior will match git's behavior for finding and using .gitignore files. 85 | # rsync_filter: 86 | # - ".gitignore" 87 | 88 | # # List of commands that will be run before `setup_commands`. If docker is 89 | # # enabled, these commands will run outside the container and before docker 90 | # # is setup. 91 | # initialization_commands: [] 92 | 93 | # # List of shell commands to run to set up each nodes. 94 | setup_commands: [conda activate NetShare] 95 | # # If we have e.g. conda dependencies stored in "/path1/on/local/machine/environment.yaml", we can prepare the 96 | # # work environment on each worker by: 97 | # # 1. making sure each worker has access to this file i.e. see the `file_mounts` section 98 | # # 2. adding a command here that creates a new conda environment on each node or if the environment already exists, 99 | # # it updates it: 100 | # # conda env create -q -n my_venv -f /path1/on/local/machine/environment.yaml || conda env update -q -n my_venv -f /path1/on/local/machine/environment.yaml 101 | # # 102 | # # Ray developers: 103 | # # you probably want to create a Docker image that 104 | # # has your Ray repo pre-cloned. Then, you can replace the pip installs 105 | # # below with a git checkout (and possibly a recompile). 106 | # # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image 107 | # # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line: 108 | # # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" 109 | 110 | # # Custom commands that will be run on the head node after common setup. 111 | head_setup_commands: [echo "hello from the head node"] 112 | 113 | # # Custom commands that will be run on worker nodes after common setup. 114 | worker_setup_commands: [echo "hello from worker nodes"] 115 | 116 | # Command to start ray on the head node. You don't need to change this. 117 | head_start_ray_commands: 118 | # If we have e.g. conda dependencies, we could create on each node a conda environment (see `setup_commands` section). 119 | # In that case we'd have to activate that env on each node before running `ray`: 120 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ray stop 121 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ulimit -c unlimited && ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml --include-dashboard=True --dashboard-host=0.0.0.0 --dashboard-port=8265 --temp-dir=/users/yyucheng/tmp 122 | # --system-config="{\"kill_idle_workers_interval_ms\":10}" 123 | # - ray stop 124 | # - ulimit -c unlimited && ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml 125 | 126 | # Command to start ray on worker nodes. You don't need to change this. 127 | worker_start_ray_commands: 128 | # If we have e.g. 
conda dependencies, we could create on each node a conda environment (see `setup_commands` section). 129 | # In that case we'd have to activate that env on each node before running `ray`: 130 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ray stop 131 | - conda activate NetShare && export LC_ALL=C.UTF-8 && ray start --address=$RAY_HEAD_IP:6379 132 | # - ray stop 133 | # - ray start --address=$RAY_HEAD_IP:6379 -------------------------------------------------------------------------------- /util/setup-cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd $HOME 3 | 4 | VIRTUAL_ENV=$1 5 | USERNAME=$2 6 | CONDA_EXEC=$HOME/anaconda3/bin/conda 7 | NETSHARE_LOCAL_REPO=/nfs/NetShare 8 | 9 | # Anaconda3 10 | if [ -f $CONDA_EXEC ] 11 | then 12 | echo "Anaconda3 installed." 13 | else 14 | echo "Anaconda3 not installed. Start installation now..." 15 | wget https://repo.anaconda.com/archive/Anaconda3-2022.05-Linux-x86_64.sh 16 | bash Anaconda3-2022.05-Linux-x86_64.sh -b -p $HOME/anaconda3 17 | fi 18 | eval "$($HOME/anaconda3/bin/conda shell.bash hook)" 19 | conda init 20 | 21 | # Create the conda environment if it does not exist yet 22 | if ! { conda env list | grep $VIRTUAL_ENV; } >/dev/null 2>&1 23 | then 24 | echo "Conda environment $VIRTUAL_ENV not installed." 25 | conda create -y --name $VIRTUAL_ENV python=3.6 26 | else 27 | echo "Conda environment $VIRTUAL_ENV installed." 28 | fi 29 | source $HOME/anaconda3/etc/profile.d/conda.sh 30 | conda activate $VIRTUAL_ENV 31 | 32 | # Clone the repo if it has not been cloned yet 33 | if ! [ -d "$NETSHARE_LOCAL_REPO" ] 34 | then 35 | echo "git clone from remote repo..." 36 | git clone https://github.com/netsharecmu/NetShare.git $NETSHARE_LOCAL_REPO 37 | else 38 | echo "$NETSHARE_LOCAL_REPO exists! Skip git clone..." 39 | fi 40 | 41 | export LC_ALL=C.UTF-8 42 | export LANG=C.UTF-8 43 | ray start --head && ray stop 44 | 45 | cd $NETSHARE_LOCAL_REPO 46 | pip3 install -e . -------------------------------------------------------------------------------- /util/setup_node_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | USER=yyucheng 4 | NUMHOSTS=21 5 | EXPERIMENTNAME=netshare-wisc 6 | PROJECTNAME=cloudmigration-pg0 7 | # LOCATION=utah 8 | LOCATION=wisc 9 | # LOCATION=clemson 10 | SITE=cloudlab.us 11 | 12 | pids=() 13 | 14 | # setup controller 15 | NODE_SYSTEM="${USER}@nfs.${EXPERIMENTNAME}.${PROJECTNAME}.${LOCATION}.${SITE}" 16 | # NODE_SYSTEM="${USER}@nfs.${EXPERIMENTNAME}.cloudmigration.emulab.net" 17 | echo $NODE_SYSTEM 18 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "sudo -n env RESIZEROOT=192 bash -s" < grow-rootfs.sh 19 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "bash -s" < setup-cpu.sh "NetShare" & 20 | scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ~/.ssh/netshare-package $NODE_SYSTEM:~/.ssh/id_rsa & 21 | pids+=($!)
22 | 23 | # setup workers 24 | COUNTER=1 25 | while [ $COUNTER -lt $NUMHOSTS ]; do 26 | NODE="node${COUNTER}" 27 | NODE_SYSTEM="${USER}@${NODE}.${EXPERIMENTNAME}.${PROJECTNAME}.${LOCATION}.${SITE}" 28 | # NODE_SYSTEM="${USER}@${NODE}.${EXPERIMENTNAME}.cloudmigration.emulab.net" 29 | echo $NODE_SYSTEM 30 | 31 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "sudo -n env RESIZEROOT=192 bash -s" < grow-rootfs.sh 32 | ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $NODE_SYSTEM "bash -s" < setup-cpu.sh "NetShare" & 33 | scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ~/.ssh/netshare-package $NODE_SYSTEM:~/.ssh/id_rsa & 34 | 35 | pids+=($!) 36 | let COUNTER=COUNTER+1 37 | done 38 | 39 | for pid in "${pids[@]}"; do 40 | wait "$pid" 41 | done --------------------------------------------------------------------------------