├── .env.example ├── .gitignore ├── README.md ├── analysis ├── million_scale.ipynb ├── million_scale.py ├── plot.ipynb ├── ripe_atlas_probes_bias.ipynb └── tables.ipynb ├── clickhouse_files ├── init-db.sh └── users.d │ └── default.xml ├── datasets └── create_datasets.ipynb ├── default.py ├── install.sh ├── logger.py ├── measurements ├── landmark_traceroutes.ipynb ├── million_scale_measurements.ipynb └── million_scale_measurements.py ├── poetry.lock ├── pyproject.toml └── scripts ├── analysis └── analysis.py ├── ripe_atlas ├── atlas_api.py └── ping_and_traceroute_classes.py ├── street_level ├── landmark.py ├── three_tiers.py └── traceroutes_results.py └── utils ├── clickhouse.py ├── clickhouse_installer.py ├── credentials.py ├── file_utils.py ├── helpers.py ├── measurement_utils.py └── plot_utils.py /.env.example: -------------------------------------------------------------------------------- 1 | RIPE_USERNAME= 2 | RIPE_SECRET_KEY= 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output files 2 | *.fsdb 3 | *.pdf 4 | *.csv 5 | *.dat 6 | *.tif 7 | *.tree 8 | *.zst 9 | *.json 10 | *.dat 11 | *.txt 12 | 13 | measurements/results 14 | clickhouse_files/data/ 15 | clickhouse_files/logs/ 16 | clickhouse_files/clickhouse 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 177 | #.idea/ 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🗺️ Replication: Towards a Publicly Available Internet scale IP Geolocation Dataset (IMC 2023) 2 | This repository contains the code needed to reproduce and replicate our results in our [IMC 2023 paper](). 3 | 4 | Our study replicates the methodology of two papers that obtained outstanding results on geolocating IP addresses in terms of coverage and accuracy in nowadays Internet on the largest publicly available measurement platform, RIPE Atlas. 5 | These two papers are: 6 | 7 | 1. [Towards geolocation of millions of IP addresses (IMC 2012)](https://dl.acm.org/doi/abs/10.1145/2398776.2398790) 8 | 9 | 2. [Towards Street-Level Client-Independent IP Geolocation (NSDI 2011)](https://www.usenix.org/legacy/event/nsdi11/tech/full_papers/Wang_Yong.pdf). 10 | 11 | They are called million scale and street level papers throughout this README, as done in our paper. 12 | 13 | Our code offers the possibility to: 14 | 1. reproduce our results using our measurement datasets. 15 | 2. replicate our methodology with different targets and vantage points. For now, only RIPE Atlas vantage points are supported, but it should not be difficult to adapt the code to handle other vantage points and targets. 16 | 17 | ## Prerequisites 18 | Our code performs measurements on RIPE Atlas, so be sure to have an account if you want to replicate our methodology with your own RIPE Atlas measurements. 19 | 20 | ⚠️ **To replicate our RIPE Atlas measurements, you will need a lot of credits (millions)**. 
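If you plan to launch your own measurements, it can help to check your credit balance before starting. The snippet below is a minimal sketch, not part of this repository, that queries the RIPE Atlas v2 API using the `RIPE_SECRET_KEY` from [.env.example](.env.example); the exact response fields may differ from what is assumed here.

```python
# Hypothetical helper (not part of this repository): check the RIPE Atlas
# credit balance of the account associated with RIPE_SECRET_KEY.
import os

import requests  # assumed to be available in your environment

resp = requests.get(
    "https://atlas.ripe.net/api/v2/credits/",
    params={"key": os.environ.get("RIPE_SECRET_KEY", "")},
    timeout=10,
)
resp.raise_for_status()
# "current_balance" is the expected field name; adjust if the API response differs.
print("Current credit balance:", resp.json().get("current_balance"))
```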
21 |
22 |
23 | ## Table of contents
24 |
25 | - [Installation](#installation)
26 | - [Requirements](#requirements)
27 | - [Download datasets](#download-datasets)
28 | - [Clone the repository](#clone-the-repository)
29 | - [Installer](#installer)
30 | - [Install source files](#install-source-files)
31 | - [Clickhouse](#clickhouse)
32 | - [Settings](#settings)
33 | - [Further notice](#further-notice)
34 | - [Reproduction](#reproduction)
35 | - [Run your own measurements](#run-your-own-measurements)
36 |
37 | ## [Installation](#installation)
38 |
39 | ### [Requirements](#requirements)
40 |
41 | - [Python 3.9](https://www.python.org/downloads/) (or above)
42 | - [Poetry](https://python-poetry.org/docs/)
43 | - [Docker](https://docs.docker.com/engine/install/)
44 |
45 |
46 | ### [Download datasets](#download-datasets)
47 |
48 | You can fetch our data from our FTP server ftp.iris.dioptra.io, which provides the ClickHouse tables dumped in CSV format.
49 |
50 | ### [Clone the repository](#clone-the-repository)
51 |
52 | ```bash
53 | git clone https://github.com/dioptra-io/geoloc-imc-2023.git
54 | cd geoloc-imc-2023
55 | ```
56 |
57 | ### [Installer](#installer)
58 |
59 | You can use the script **install.sh** to:
60 | - Pull the clickhouse docker image.
61 | - Start the clickhouse server.
62 | - Download the clickhouse-client binary.
63 | - Install the Python project using Poetry.
64 | - Create all tables and populate the database with our measurements.
65 |
66 | ```bash
67 | source install.sh
68 | ```
69 | If the installation fails, all necessary steps to use the project are described below.
70 |
71 | ### [Install source files](#install-source-files)
72 |
73 | GeoScale uses Poetry as its dependency manager; install the project with:
74 | ```bash
75 | poetry shell
76 | poetry lock
77 | poetry install
78 | ```
79 |
80 | ### [Clickhouse](#clickhouse)
81 |
82 | We use Docker to run the ClickHouse server; by default the server listens on localhost on port 8123 (HTTP) and port 9000 (native TCP). If you prefer using your own Docker configuration, please also modify [default.py](default.py).
83 | ```bash
84 |
85 | # pull the docker image
86 | docker pull clickhouse/clickhouse-server:22.6
87 |
88 | # start the server
89 | docker run --rm -d \
90 | -v ./clickhouse_files/data:/var/lib/clickhouse/ \
91 | -v ./clickhouse_files/logs:/var/log/clickhouse-server/ \
92 | -v ./clickhouse_files/users.d:/etc/clickhouse-server/users.d:ro \
93 | -v ./clickhouse_files/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \
94 | -p 8123:8123 \
95 | -p 9000:9000 \
96 | --ulimit nofile=262144:262144 \
97 | clickhouse/clickhouse-server:22.6
98 | ```
99 |
100 | You can either install [clickhouse-client](https://clickhouse.com/docs/en/install) or download the clickhouse client binary (by default, [install.sh](install.sh) downloads the binary file).
101 | ```bash
102 | curl https://clickhouse.com/ | sh
103 | mv clickhouse ./clickhouse_files/
104 | ```
105 |
106 | Finally, create all necessary tables and populate them with our own measurements:
107 | ```bash
108 | python scripts/utils/clickhouse_installer.py
109 | ```
110 |
111 |
112 | ### [Settings](#settings)
113 |
114 | Our tool relies on ENV variables for configuring ClickHouse and interacting with the RIPE Atlas API.
115 | An example of the necessary ENV variables is given in [.env.example](.env.example).
Create your own
116 | env file with the following values:
117 | ```.env
118 | RIPE_USERNAME=
119 | RIPE_SECRET_KEY=
120 | ```
121 |
122 | ⚠️ **If** you use your own ClickHouse configuration, you can also modify the following ENV variables:
123 | ```
124 | # clickhouse settings
125 | CLICKHOUSE_CLIENT=
126 | CLICKHOUSE_HOST=
127 | CLICKHOUSE_DB=
128 | CLICKHOUSE_USER=
129 | CLICKHOUSE_PASSWORD=
130 | ```
131 | ### [Further notice](#further-notice)
132 |
133 | #### Test environment
134 |
135 | The project has been run on:
136 | - CentOS 7.5
137 | - Python 3.9
138 | - a server with 64 GB of RAM and 32 cores.
139 |
140 | ⚠️ Some scripts and analyses can use a lot of CPU and RAM (tens of GB) and last for hours.
141 |
142 |
143 | ## [Reproducing our results](#reproduction)
144 |
145 | We provide Python scripts and Jupyter notebooks to reproduce the results and graphs that we obtained when replicating the million scale and street level papers.
146 |
147 | ### Million Scale
148 |
149 | You can reproduce the million scale results using a Jupyter notebook: [million_scale.ipynb](./analysis/million_scale.ipynb)
150 |
151 | Alternatively, you can run the Python script in the background, as some steps take very long to execute (several hours):
152 | ```bash
153 | nohup python analysis/million_scale.py > output.log &
154 | ```
155 |
156 | All analysis results can be found in **./analysis/results**.
157 |
158 | ### Street level
159 |
160 | ⚠️ Tier 1 of the street-level replication (see the paper for more details) relies on results computed by the million scale technique. You need to run the million scale notebook/script **before** running those of street-level.
161 |
162 | No additional steps are necessary to reproduce the street-level experiment.
163 |
164 | ### Generating figures
165 |
166 | You can directly use the notebooks [plot.ipynb](./analysis/plot.ipynb) and [tables.ipynb](./analysis/tables.ipynb) to produce the figures and tables of our paper.
167 |
168 | ## [Run your own measurements](#run-your-own-measurements)
169 |
170 | You can also run your own measurements on custom datasets of targets (anchors) and vantage points (probes).
171 |
172 | ### First step: generate targets and vantage points datasets
173 |
174 | The Jupyter notebook [create_dataset](./datasets/create_datasets.ipynb) will:
175 | - generate the set of probes (used as vantage points)
176 | - generate the set of anchors (used as targets)
177 | - filter both sets by removing problematic probes (wrongly geolocated, for example)
178 |
179 | All generated files will be placed in /datasets/user_datasets.
180 |
181 | ### Second step: run measurements
182 |
183 | With [million_scale_measurements.ipynb](./measurements/million_scale_measurements.ipynb), you can select a subset of vantage points and targets and run measurements on RIPE Atlas.
184 |
185 | This notebook will start measurements:
186 | 1. towards all targets from all vantage points
187 | 2. towards 3 responsive addresses for each target from all vantage points
188 |
189 | ⚠️ These measurements might cost a lot of RIPE Atlas credits and time if you run them on large datasets (the default is only 2 targets and 4 vantage points).
190 |
191 | ### Third step: analyze your results
192 |
193 | Perform the analysis using the same steps described previously on your own measurement results and datasets by setting the boolean variable ```run_repro = True``` at the beginning of [million_scale.ipynb](./analysis/million_scale.ipynb) (or [million_scale.py](./analysis/million_scale.py) if you are using the script).
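For intuition about what this analysis computes: both the million scale and street level techniques rely on the classic constraint that a measured round-trip time upper-bounds the distance between a vantage point and a target. The sketch below is illustrative only; it mirrors the `SPEED_OF_LIGHT` / `SPEED_OF_INTERNET` constants defined in [default.py](default.py) and reimplements a haversine helper locally instead of importing the repository's `scripts/utils/helpers.py`.

```python
# Standalone sketch of the RTT-to-distance constraint used in CBG-style geolocation.
# SPEED_OF_LIGHT / SPEED_OF_INTERNET mirror the constants in default.py;
# the haversine helper below is a local reimplementation for illustration only.
from math import radians, sin, cos, asin, sqrt

SPEED_OF_LIGHT = 300000                      # km/s
SPEED_OF_INTERNET = SPEED_OF_LIGHT * 2 / 3   # ~2/3 c, typical propagation speed in fiber


def haversine(coord1, coord2):
    """Great-circle distance in km between two (lat, lon) points."""
    lat1, lon1 = map(radians, coord1)
    lat2, lon2 = map(radians, coord2)
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))


def max_distance_km(min_rtt_ms: float) -> float:
    """Upper bound on the VP-target distance implied by a minimum RTT."""
    one_way_s = (min_rtt_ms / 1000) / 2
    return one_way_s * SPEED_OF_INTERNET


# A vantage point in Paris measuring a 10 ms minimum RTT cannot be farther than
# ~1000 km from the target, so a candidate location in New York is ruled out.
vp = (48.85, 2.35)           # Paris
candidate = (40.71, -74.0)   # New York
print(max_distance_km(10.0))                               # 1000.0 km
print(haversine(vp, candidate) <= max_distance_km(10.0))   # False
```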
194 | 195 | 196 | 197 | TODO: Street level 198 | 199 | ## 📚 Publications 200 | 201 | ```bibtex 202 | @inproceedings{darwich2023replication, 203 | title={Replication: Towards a Publicly Available Internet scale IP Geolocation Dataset}, 204 | author={Darwich, Omar and Rimlinger, Hugo and Dreyfus, Milo and Gouel, Matthieu and Vermeulen, Kevin}, 205 | booktitle={Proceedings of the 2023 ACM on Internet Measurement Conference}, 206 | pages={1--15}, 207 | year={2023} 208 | } 209 | ``` 210 | 211 | 212 | ## 🧑‍💻 Authors 213 | 214 | This project is the result of a collaboration between the [LAAS-CNRS](https://www.laas.fr/public/) and [Sorbonne Université](https://www.sorbonne-universite.fr/). 215 | 216 | -------------------------------------------------------------------------------- /analysis/million_scale.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# First step of the analysis\n", 8 | "\n", 9 | "Preprocess results and save them before they can be plotted. \n", 10 | "\n", 11 | "To do after measurements notebooks" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from scripts.utils.file_utils import load_json, dump_json\n", 21 | "\n", 22 | "from scripts.analysis.analysis import *\n", 23 | "from default import *\n", 24 | "\n", 25 | "# set to True to use your own datasets/measurements\n", 26 | "run_repro = False\n", 27 | "if run_repro:\n", 28 | " # DATASET FILES\n", 29 | " PROBES_FILE = REPRO_PROBES_FILE\n", 30 | " PROBES_AND_ANCHORS_FILE = REPRO_PROBES_AND_ANCHORS_FILE\n", 31 | " FILTERED_PROBES_FILE = REPRO_FILTERED_PROBES_FILE\n", 32 | " GREEDY_PROBES_FILE = REPRO_GREEDY_PROBES_FILE\n", 33 | " PAIRWISE_DISTANCE_FILE = REPRO_PAIRWISE_DISTANCE_FILE\n", 34 | " VPS_TO_TARGET_TABLE = PROBES_TO_ANCHORS_PING_TABLE\n", 35 | " VPS_TO_PREFIX_TABLE = PROBES_TO_PREFIX_TABLE\n", 36 | "\n", 37 | " # RESULT FILES\n", 38 | " PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE\n", 39 | " ROUND_BASED_ALGORITHM_FILE = REPRO_ROUND_BASED_ALGORITHM_FILE\n", 40 | " ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE\n", 41 | " VP_SELECTION_ALGORITHM_PROBES_1_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE\n", 42 | " VP_SELECTION_ALGORITHM_PROBES_3_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE\n", 43 | " VP_SELECTION_ALGORITHM_PROBES_10_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE\n", 44 | " \n", 45 | "else:\n", 46 | " # DATASET FILES\n", 47 | " PROBES_FILE = USER_PROBES_FILE\n", 48 | " PROBES_AND_ANCHORS_FILE = USER_PROBES_AND_ANCHORS_FILE\n", 49 | " FILTERED_PROBES_FILE = USER_FILTERED_PROBES_FILE\n", 50 | " GREEDY_PROBES_FILE = USER_GREEDY_PROBES_FILE\n", 51 | " PAIRWISE_DISTANCE_FILE = USER_PAIRWISE_DISTANCE_FILE\n", 52 | " VPS_TO_TARGET_TABLE = USER_VPS_TO_TARGET_TABLE\n", 53 | " VPS_TO_PREFIX_TABLE = USER_VPS_TO_PREFIX_TABLE\n", 54 | "\n", 55 | " # RESULT FILES\n", 56 | " PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE\n", 57 | " ROUND_BASED_ALGORITHM_FILE = USER_ROUND_BASED_ALGORITHM_FILE\n", 58 | " ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE\n", 59 | " VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE\n", 60 | " VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE\n", 61 | " VP_SELECTION_ALGORITHM_PROBES_10_FILE = 
USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE\n", 62 | "\n", 63 | "LIMIT = 1000" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "filtered_probes = load_json(FILTERED_PROBES_FILE)\n", 73 | "\n", 74 | "filter = \"\"\n", 75 | "if len(filtered_probes) > 0:\n", 76 | " # Remove probes that are wrongly geolocated\n", 77 | " in_clause = f\"\".join(\n", 78 | " [f\",toIPv4('{p}')\" for p in filtered_probes])[1:]\n", 79 | " filter += f\"AND dst not in ({in_clause}) AND src not in ({in_clause}) \"\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Compute errors\n", 87 | "\n", 88 | "Compute the median error between the guessed geolocations and the real geolocations" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 98 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, vp_distance_matrix, probes_per_ip = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=70)\n", 108 | "\n", 109 | "vps_per_target = {dst: set(vp_coordinates_per_ip.keys())\n", 110 | " for dst in rtt_per_srcs_dst}\n", 111 | "features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip, THRESHOLD_DISTANCES,\n", 112 | " vps_per_target=vps_per_target,\n", 113 | " distance_operator=\">\", max_vps=100000,\n", 114 | " is_use_prefix=False,\n", 115 | " vp_distance_matrix=vp_distance_matrix,\n", 116 | " )\n", 117 | "\n", 118 | "dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Round Algorithm\n", 126 | "\n", 127 | "First is to use a subset of greedy probes, and then take 1 probe/AS in the given CBG area to compute the median error." 
128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 137 | "\n", 138 | "asn_per_vp_ip = {}\n", 139 | "vp_coordinates_per_ip = {}\n", 140 | "\n", 141 | "for probe in all_probes:\n", 142 | " if \"address_v4\" in probe and \"geometry\" in probe and \"coordinates\" in probe[\"geometry\"]:\n", 143 | " ip_v4_address = probe[\"address_v4\"]\n", 144 | " if ip_v4_address is None:\n", 145 | " continue\n", 146 | " long, lat = probe[\"geometry\"][\"coordinates\"]\n", 147 | " asn_v4 = probe[\"asn_v4\"]\n", 148 | " asn_per_vp_ip[ip_v4_address] = asn_v4\n", 149 | " vp_coordinates_per_ip[ip_v4_address] = lat, long\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# clickhouse is required here\n", 159 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=100)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 8, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 9, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Using 10 tier1_vps\n", 181 | "Using 100 tier1_vps\n", 182 | "Using 300 tier1_vps\n", 183 | "Using 500 tier1_vps\n", 184 | "Using 1000 tier1_vps\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "TIER1_VPS = [10, 100, 300, 500, 1000]\n", 190 | "greedy_probes = load_json(GREEDY_PROBES_FILE)\n", 191 | "error_cdf_per_tier1_vps = {}\n", 192 | "for tier1_vps in TIER1_VPS:\n", 193 | " print(f\"Using {tier1_vps} tier1_vps\")\n", 194 | " error_cdf = round_based_algorithm(greedy_probes, rtt_per_srcs_dst, vp_coordinates_per_ip,\n", 195 | " asn_per_vp_ip,\n", 196 | " tier1_vps,\n", 197 | " threshold=40)\n", 198 | " error_cdf_per_tier1_vps[tier1_vps] = error_cdf\n", 199 | " \n", 200 | "dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Accuracy vs number of vps probes\n", 208 | "WARNING : Time consumming section \n", 209 | "\n", 210 | "Compute median error for each target, depending on the number of initial VPs." 
211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 10, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 220 | "\n", 221 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, \\\n", 222 | " vp_distance_matrix, probe_per_ip = compute_geo_info(\n", 223 | " all_probes, serialized_file=PAIRWISE_DISTANCE_FILE)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 12, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stderr", 233 | "output_type": "stream", 234 | "text": [ 235 | "2023-09-13 16:22:03::INFO:root:analysis:: Starting computing for random VPs 100\n", 236 | "2023-09-13 16:23:13::INFO:root:analysis:: Starting computing for random VPs 200\n", 237 | "2023-09-13 16:24:21::INFO:root:analysis:: Starting computing for random VPs 300\n", 238 | "2023-09-13 16:25:31::INFO:root:analysis:: Starting computing for random VPs 400\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "subset_sizes = []\n", 244 | "subset_sizes.extend([i for i in range(100, 500, 100)])\n", 245 | "# subset_sizes.extend([i for i in range(1000, 10001, 1000)])\n", 246 | "\n", 247 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=50)\n", 248 | "\n", 249 | "available_vps = list(vp_coordinates_per_ip.keys())\n", 250 | "accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps(available_vps, rtt_per_srcs_dst, vp_coordinates_per_ip,\n", 251 | " vp_distance_matrix, subset_sizes)\n", 252 | "\n", 253 | "dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## VPs selection algorithm\n", 261 | "\n", 262 | "Select respectively the 1, 3, and 10 closest probes (with minimal round trip time) for each target." 
263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 13, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 272 | "\n", 273 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, vp_distance_matrix, probes_per_ip = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 16, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "ping_table_prefix = VPS_TO_PREFIX_TABLE\n", 283 | "ping_table = VPS_TO_TARGET_TABLE\n", 284 | "N_VPS_SELECTION_ALGORITHM = [1, 3, 10]\n", 285 | "results_files = [VP_SELECTION_ALGORITHM_PROBES_1_FILE, VP_SELECTION_ALGORITHM_PROBES_3_FILE, VP_SELECTION_ALGORITHM_PROBES_10_FILE]\n", 286 | "\n", 287 | "rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(ping_table_prefix, filter, threshold=100, is_per_prefix=True)\n", 288 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)\n", 289 | "\n", 290 | "for i, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM):\n", 291 | " vps_per_target = compute_closest_rtt_probes(rtt_per_srcs_dst_prefix,\n", 292 | " vp_coordinates_per_ip,\n", 293 | " vp_distance_matrix,\n", 294 | " n_shortest=n_vp,\n", 295 | " is_prefix=True)\n", 296 | " features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip,\n", 297 | " [0],\n", 298 | " vps_per_target=vps_per_target,\n", 299 | " distance_operator=\">\", max_vps=100000,\n", 300 | " is_use_prefix=True,\n", 301 | " vp_distance_matrix=vp_distance_matrix,\n", 302 | " is_multiprocess=True)\n", 303 | " \n", 304 | " ofile = results_files[i]\n", 305 | " dump_json(features, ofile)" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "review-fXCvvitn-py3.10", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.10.9" 326 | }, 327 | "orig_nbformat": 4 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 2 331 | } 332 | -------------------------------------------------------------------------------- /analysis/million_scale.py: -------------------------------------------------------------------------------- 1 | from scripts.utils.file_utils import load_json, dump_json 2 | 3 | from scripts.analysis.analysis import * 4 | from default import * 5 | 6 | 7 | if __name__ == "__main__": 8 | # set to True to use your own datasets/measurements 9 | run_repro = True 10 | if run_repro: 11 | # DATASET FILES 12 | PROBES_FILE = REPRO_PROBES_FILE 13 | PROBES_AND_ANCHORS_FILE = REPRO_PROBES_AND_ANCHORS_FILE 14 | FILTERED_PROBES_FILE = REPRO_FILTERED_PROBES_FILE 15 | GREEDY_PROBES_FILE = REPRO_GREEDY_PROBES_FILE 16 | PAIRWISE_DISTANCE_FILE = REPRO_PAIRWISE_DISTANCE_FILE 17 | VPS_TO_TARGET_TABLE = PROBES_TO_ANCHORS_PING_TABLE 18 | VPS_TO_PREFIX_TABLE = PROBES_TO_PREFIX_TABLE 19 | 20 | # RESULT FILES 21 | PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE 22 | ROUND_BASED_ALGORITHM_FILE = REPRO_ROUND_BASED_ALGORITHM_FILE 23 | ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE 24 | VP_SELECTION_ALGORITHM_PROBES_1_FILE = ( 25 | REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE 26 | ) 27 | VP_SELECTION_ALGORITHM_PROBES_3_FILE = ( 28 | 
REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE 29 | ) 30 | VP_SELECTION_ALGORITHM_PROBES_10_FILE = ( 31 | REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE 32 | ) 33 | 34 | else: 35 | # DATASET FILES 36 | PROBES_FILE = USER_PROBES_FILE 37 | PROBES_AND_ANCHORS_FILE = USER_PROBES_AND_ANCHORS_FILE 38 | FILTERED_PROBES_FILE = USER_FILTERED_PROBES_FILE 39 | GREEDY_PROBES_FILE = USER_GREEDY_PROBES_FILE 40 | PAIRWISE_DISTANCE_FILE = USER_PAIRWISE_DISTANCE_FILE 41 | VPS_TO_TARGET_TABLE = USER_VPS_TO_TARGET_TABLE 42 | VPS_TO_PREFIX_TABLE = USER_VPS_TO_PREFIX_TABLE 43 | 44 | # RESULT FILES 45 | PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE 46 | ROUND_BASED_ALGORITHM_FILE = USER_ROUND_BASED_ALGORITHM_FILE 47 | ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE 48 | VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE 49 | VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE 50 | VP_SELECTION_ALGORITHM_PROBES_10_FILE = ( 51 | USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE 52 | ) 53 | 54 | LIMIT = 1000 55 | 56 | filtered_probes = load_json(FILTERED_PROBES_FILE) 57 | 58 | filter = "" 59 | if len(filtered_probes) > 0: 60 | # Remove probes that are wrongly geolocated 61 | in_clause = f"".join([f",toIPv4('{p}')" for p in filtered_probes])[1:] 62 | filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) " 63 | 64 | logger.info("Step 1: Compute errors") 65 | 66 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 67 | ( 68 | vp_coordinates_per_ip, 69 | ip_per_coordinates, 70 | country_per_vp, 71 | asn_per_vp, 72 | vp_distance_matrix, 73 | probes_per_ip, 74 | ) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE) 75 | 76 | rtt_per_srcs_dst = compute_rtts_per_dst_src( 77 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=70 78 | ) 79 | 80 | vps_per_target = { 81 | dst: set(vp_coordinates_per_ip.keys()) for dst in rtt_per_srcs_dst 82 | } 83 | features = compute_geolocation_features_per_ip( 84 | rtt_per_srcs_dst, 85 | vp_coordinates_per_ip, 86 | THRESHOLD_DISTANCES, 87 | vps_per_target=vps_per_target, 88 | distance_operator=">", 89 | max_vps=100000, 90 | is_use_prefix=False, 91 | vp_distance_matrix=vp_distance_matrix, 92 | ) 93 | 94 | dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE) 95 | 96 | logger.info("Step 2: Round Algorithm") 97 | 98 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 99 | 100 | asn_per_vp_ip = {} 101 | vp_coordinates_per_ip = {} 102 | 103 | for probe in all_probes: 104 | if ( 105 | "address_v4" in probe 106 | and "geometry" in probe 107 | and "coordinates" in probe["geometry"] 108 | ): 109 | ip_v4_address = probe["address_v4"] 110 | if ip_v4_address is None: 111 | continue 112 | long, lat = probe["geometry"]["coordinates"] 113 | asn_v4 = probe["asn_v4"] 114 | asn_per_vp_ip[ip_v4_address] = asn_v4 115 | vp_coordinates_per_ip[ip_v4_address] = lat, long 116 | 117 | # clickhouse is required here 118 | rtt_per_srcs_dst = compute_rtts_per_dst_src( 119 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=100 120 | ) 121 | vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE) 122 | 123 | TIER1_VPS = [10, 100, 300, 500, 1000] 124 | greedy_probes = load_json(GREEDY_PROBES_FILE) 125 | error_cdf_per_tier1_vps = {} 126 | for tier1_vps in TIER1_VPS: 127 | print(f"Using {tier1_vps} tier1_vps") 128 | error_cdf = round_based_algorithm( 129 | greedy_probes, 130 | rtt_per_srcs_dst, 131 | vp_coordinates_per_ip, 132 | asn_per_vp_ip, 133 | tier1_vps, 134 | threshold=40, 135 | ) 136 | error_cdf_per_tier1_vps[tier1_vps] = 
error_cdf 137 | 138 | dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE) 139 | 140 | logger.info("Accuracy vs number of vps probes") 141 | logger.warning("this step might takes several hours") 142 | 143 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 144 | 145 | ( 146 | vp_coordinates_per_ip, 147 | ip_per_coordinates, 148 | country_per_vp, 149 | asn_per_vp, 150 | vp_distance_matrix, 151 | probe_per_ip, 152 | ) = compute_geo_info(all_probes, serialized_file=PAIRWISE_DISTANCE_FILE) 153 | 154 | logger.info("Accuracy vs number of vps probes") 155 | 156 | subset_sizes = [] 157 | subset_sizes.extend([i for i in range(100, 1000, 100)]) 158 | # subset_sizes.extend([i for i in range(1000, 10001, 1000)]) 159 | 160 | rtt_per_srcs_dst = compute_rtts_per_dst_src( 161 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=50 162 | ) 163 | 164 | available_vps = list(vp_coordinates_per_ip.keys()) 165 | accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps( 166 | available_vps, 167 | rtt_per_srcs_dst, 168 | vp_coordinates_per_ip, 169 | vp_distance_matrix, 170 | subset_sizes, 171 | ) 172 | 173 | dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE) 174 | 175 | logger.info("vp selection algorithm") 176 | 177 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 178 | 179 | ( 180 | vp_coordinates_per_ip, 181 | ip_per_coordinates, 182 | country_per_vp, 183 | asn_per_vp, 184 | vp_distance_matrix, 185 | probes_per_ip, 186 | ) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE) 187 | 188 | ping_table_prefix = PROBES_TO_PREFIX_TABLE 189 | ping_table = PROBES_TO_ANCHORS_PING_TABLE 190 | N_VPS_SELECTION_ALGORITHM = [1, 3, 10] 191 | results_files = [ 192 | VP_SELECTION_ALGORITHM_PROBES_1_FILE, 193 | VP_SELECTION_ALGORITHM_PROBES_3_FILE, 194 | VP_SELECTION_ALGORITHM_PROBES_10_FILE, 195 | ] 196 | 197 | rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src( 198 | ping_table_prefix, filter, threshold=100, is_per_prefix=True 199 | ) 200 | rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70) 201 | 202 | for i, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM): 203 | vps_per_target = compute_closest_rtt_probes( 204 | rtt_per_srcs_dst_prefix, 205 | vp_coordinates_per_ip, 206 | vp_distance_matrix, 207 | n_shortest=n_vp, 208 | is_prefix=True, 209 | ) 210 | features = compute_geolocation_features_per_ip( 211 | rtt_per_srcs_dst, 212 | vp_coordinates_per_ip, 213 | [0], 214 | vps_per_target=vps_per_target, 215 | distance_operator=">", 216 | max_vps=100000, 217 | is_use_prefix=True, 218 | vp_distance_matrix=vp_distance_matrix, 219 | is_multiprocess=True, 220 | ) 221 | 222 | ofile = results_files[i] 223 | dump_json(features, ofile) 224 | -------------------------------------------------------------------------------- /analysis/ripe_atlas_probes_bias.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, REPRO_PROBES_AND_ANCHORS_FILE, REPRO_ANCHORS_FILE, REPRO_PROBES_FILE" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# load datasets" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "with ASNS_TYPE_CAIDA.open(\"r\") as f:\n", 29 | " asns_categories_caida = json.load(f)\n", 30 | 
"\n", 31 | "with ASNS_TYPE_STANFORD.open(\"r\") as f:\n", 32 | " asns_categories_stanford = json.load(f)\n", 33 | " \n", 34 | "with REPRO_PROBES_AND_ANCHORS_FILE.open(\"r\") as f:\n", 35 | " probes_and_anchors = json.load(f)\n", 36 | "\n", 37 | "with REPRO_PROBES_FILE.open(\"r\") as f:\n", 38 | " probes = json.load(f)\n", 39 | "\n", 40 | "with REPRO_ANCHORS_FILE.open(\"r\") as f:\n", 41 | " anchors = json.load(f)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def get_anchor_as_category(asns_category: dict, ripe_vps_dataset: dict) -> dict:\n", 51 | " \"\"\"return one category per anchor\"\"\"\n", 52 | " ripe_categories = []\n", 53 | "\n", 54 | " for ripe_vp in ripe_vps_dataset:\n", 55 | " try:\n", 56 | " ripe_categories.append({\n", 57 | " \"id\": ripe_vp['id'],\n", 58 | " \"category\": asns_category[str(ripe_vp[\"asn_v4\"])]\n", 59 | " })\n", 60 | " except KeyError:\n", 61 | " ripe_categories.append({\n", 62 | " \"id\": ripe_vp['id'],\n", 63 | " \"category\": \"Unknown\"\n", 64 | " })\n", 65 | " continue\n", 66 | " return ripe_categories\n", 67 | "\n", 68 | "def get_categories_percentage(categories_df: pd.DataFrame) -> dict:\n", 69 | " \"\"\"get percentage per categories from a set of categories\"\"\"\n", 70 | " category_repartition = dict()\n", 71 | "\n", 72 | " category_set = categories_df[\"category\"].unique()\n", 73 | " for category in category_set:\n", 74 | " percentage = len(categories_df[categories_df[\"category\"] == category]) * 100 / len(categories_df[\"id\"])\n", 75 | " category_repartition[category] = percentage\n", 76 | "\n", 77 | " print(f\"{category} : {len(categories_df[categories_df['category'] == category])} ({round(percentage,1)}%)\")\n", 78 | "\n", 79 | " assert round(sum([v for v in category_repartition.values()])) == 100 \n", 80 | "\n", 81 | " return category_repartition" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Get targets type" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "category_caida_anchors = get_anchor_as_category(asns_categories_caida, anchors)\n", 98 | "category_caida_probes = get_anchor_as_category(asns_categories_caida, probes)\n", 99 | "category_caida_probes_and_anchors = get_anchor_as_category(asns_categories_caida, probes_and_anchors)\n", 100 | "\n", 101 | "category_stanford_anchors = get_anchor_as_category(asns_categories_stanford, anchors)\n", 102 | "category_stanford_probes = get_anchor_as_category(asns_categories_stanford, probes)\n", 103 | "category_stanford_probes_and_anchors = get_anchor_as_category(asns_categories_stanford, probes_and_anchors)\n", 104 | "\n", 105 | "caida_df_anchors = pd.DataFrame(category_caida_anchors, columns=[\"id\", \"category\"])\n", 106 | "caida_df_probes = pd.DataFrame(category_caida_probes, columns=[\"id\", \"category\"])\n", 107 | "caida_df_probes_and_anchors = pd.DataFrame(category_caida_probes_and_anchors, columns=[\"id\", \"category\"])\n", 108 | "\n", 109 | "stanford_df_anchors = pd.DataFrame(category_stanford_anchors, columns=[\"id\", \"category\"])\n", 110 | "stanford_df_probes = pd.DataFrame(category_stanford_probes, columns=[\"id\", \"category\"])\n", 111 | "stanford_df_probes_and_anchors = pd.DataFrame(category_stanford_probes_and_anchors, columns=[\"id\", \"category\"])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 
| "# Caida categories" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Anchors results: \n", 131 | "\n", 132 | "Content : 229 (31.7%)\n", 133 | "Access : 211 (29.2%)\n", 134 | "Transit/Access : 197 (27.2%)\n", 135 | "Enterprise : 55 (7.6%)\n", 136 | "tier-1 : 6 (0.8%)\n", 137 | "Unknown : 25 (3.5%)\n", 138 | "\n", 139 | "Probes results: \n", 140 | "\n", 141 | "Access : 9124 (75.2%)\n", 142 | "Transit/Access : 1005 (8.3%)\n", 143 | "Enterprise : 410 (3.4%)\n", 144 | "Unknown : 312 (2.6%)\n", 145 | "Content : 1112 (9.2%)\n", 146 | "tier-1 : 166 (1.4%)\n", 147 | "\n", 148 | "Probes and anchors results: \n", 149 | "\n", 150 | "Access : 9347 (72.4%)\n", 151 | "Transit/Access : 1221 (9.5%)\n", 152 | "Enterprise : 472 (3.7%)\n", 153 | "Unknown : 339 (2.6%)\n", 154 | "Content : 1361 (10.5%)\n", 155 | "tier-1 : 174 (1.3%)\n", 156 | "\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "print(\"Anchors results: \\n\")\n", 162 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_anchors)\n", 163 | "print()\n", 164 | "\n", 165 | "print(\"Probes results: \\n\")\n", 166 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_probes)\n", 167 | "print()\n", 168 | "\n", 169 | "print(\"Probes and anchors results: \\n\")\n", 170 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_probes_and_anchors)\n", 171 | "print()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Stanford categories" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "Anchors results: \n", 191 | "\n", 192 | "Computer and Information Technology : 521 (72.1%)\n", 193 | "Education and Research : 38 (5.3%)\n", 194 | "Community Groups and Nonprofits : 33 (4.6%)\n", 195 | "Health Care Services : 2 (0.3%)\n", 196 | "Finance and Insurance : 6 (0.8%)\n", 197 | "Unknown : 53 (7.3%)\n", 198 | "Media, Publishing, and Broadcasting : 21 (2.9%)\n", 199 | "Service : 25 (3.5%)\n", 200 | "Construction and Real Estate : 5 (0.7%)\n", 201 | "Travel and Accommodation : 2 (0.3%)\n", 202 | "Government and Public Administration : 3 (0.4%)\n", 203 | "Retail Stores, Wholesale, and E-commerce Sites : 5 (0.7%)\n", 204 | "Utilities (Excluding Internet Service) : 1 (0.1%)\n", 205 | "Manufacturing : 2 (0.3%)\n", 206 | "Other : 4 (0.6%)\n", 207 | "Museums, Libraries, and Entertainment : 1 (0.1%)\n", 208 | "Freight, Shipment, and Postal Services : 1 (0.1%)\n", 209 | "\n", 210 | "Probes results: \n", 211 | "\n", 212 | "Computer and Information Technology : 10028 (82.7%)\n", 213 | "Community Groups and Nonprofits : 129 (1.1%)\n", 214 | "Unknown : 842 (6.9%)\n", 215 | "Education and Research : 352 (2.9%)\n", 216 | "Construction and Real Estate : 60 (0.5%)\n", 217 | "Manufacturing : 25 (0.2%)\n", 218 | "Service : 300 (2.5%)\n", 219 | "Media, Publishing, and Broadcasting : 183 (1.5%)\n", 220 | "Other : 14 (0.1%)\n", 221 | "Retail Stores, Wholesale, and E-commerce Sites : 105 (0.9%)\n", 222 | "Government and Public Administration : 18 (0.1%)\n", 223 | "Health Care Services : 8 (0.1%)\n", 224 | "Finance and Insurance : 22 (0.2%)\n", 225 | "Utilities (Excluding Internet Service) : 16 (0.1%)\n", 226 | "Museums, Libraries, and Entertainment : 8 (0.1%)\n", 227 | "Travel and 
Accommodation : 10 (0.1%)\n", 228 | "Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming) : 4 (0.0%)\n", 229 | "Freight, Shipment, and Postal Services : 5 (0.0%)\n", 230 | "\n", 231 | "Probes and anchors results: \n", 232 | "\n", 233 | "Computer and Information Technology : 10590 (82.0%)\n", 234 | "Community Groups and Nonprofits : 163 (1.3%)\n", 235 | "Unknown : 901 (7.0%)\n", 236 | "Education and Research : 393 (3.0%)\n", 237 | "Construction and Real Estate : 65 (0.5%)\n", 238 | "Manufacturing : 27 (0.2%)\n", 239 | "Service : 328 (2.5%)\n", 240 | "Media, Publishing, and Broadcasting : 206 (1.6%)\n", 241 | "Other : 19 (0.1%)\n", 242 | "Retail Stores, Wholesale, and E-commerce Sites : 115 (0.9%)\n", 243 | "Government and Public Administration : 21 (0.2%)\n", 244 | "Health Care Services : 10 (0.1%)\n", 245 | "Finance and Insurance : 28 (0.2%)\n", 246 | "Utilities (Excluding Internet Service) : 17 (0.1%)\n", 247 | "Museums, Libraries, and Entertainment : 9 (0.1%)\n", 248 | "Travel and Accommodation : 12 (0.1%)\n", 249 | "Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming) : 4 (0.0%)\n", 250 | "Freight, Shipment, and Postal Services : 6 (0.0%)\n", 251 | "\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "print(\"Anchors results: \\n\")\n", 257 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_anchors)\n", 258 | "print()\n", 259 | "\n", 260 | "print(\"Probes results: \\n\")\n", 261 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_probes)\n", 262 | "print()\n", 263 | "\n", 264 | "print(\"Probes and anchors results: \\n\")\n", 265 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_probes_and_anchors)\n", 266 | "print()" 267 | ] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "review-QY-dYH-y-py3.10", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.10.9" 287 | }, 288 | "orig_nbformat": 4 289 | }, 290 | "nbformat": 4, 291 | "nbformat_minor": 2 292 | } 293 | -------------------------------------------------------------------------------- /analysis/tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Print tables\n", 8 | "\n", 9 | "Print all the tables of the replication paper \n", 10 | "To do after analysis/million_scale.ipynb" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import pyasn\n", 20 | "\n", 21 | "from ipaddress import ip_network\n", 22 | "from clickhouse_driver import Client\n", 23 | "\n", 24 | "from scripts.utils.file_utils import load_json\n", 25 | "from scripts.utils.clickhouse import Clickhouse\n", 26 | "from scripts.analysis.analysis import get_all_bgp_prefixes, is_same_bgp_prefix, every_tier_result_and_errors\n", 27 | "from scripts.utils.helpers import haversine\n", 28 | "from default import IP_TO_ASN_FILE, ANALYZABLE_FILE, ROUND_BASED_ALGORITHM_FILE, TARGET_TO_LANDMARKS_PING_TABLE" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | 
"## Measurement overhead" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Figure 3.c of the replication paper" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "round_based_algorithm_results = load_json(ROUND_BASED_ALGORITHM_FILE)\n", 52 | "\n", 53 | "round_based_algorithm_results = {\n", 54 | "int(x): round_based_algorithm_results[x] for x in round_based_algorithm_results}" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "10 5785182\n", 67 | "100 4459050\n", 68 | "300 3205290\n", 69 | "500 2800245\n", 70 | "1000 2817933\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "for tier1_vps, results in sorted(round_based_algorithm_results.items()):\n", 76 | " tier1_vps = int(tier1_vps)\n", 77 | " n_vps_cdf = [r[2] + tier1_vps for r in results if r[2] is not None]\n", 78 | " print(tier1_vps, 3 * sum(n_vps_cdf))" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Number of landmarks within a certain radius" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Figure 5.b of the replication paper" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Found 78.128.211.119 with a landmark in the same /24\n", 105 | "Found 77.109.180.62 with a landmark in the same /24\n", 106 | "Found 103.143.136.43 with a landmark in the same /24\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "data = load_json(ANALYZABLE_FILE)\n", 112 | "\n", 113 | "valid_landmarks_count = 0\n", 114 | "unvalid_landmarks_count = 0\n", 115 | "same_asn_lst = []\n", 116 | "same_24_lst = []\n", 117 | "same_bgp_lst = []\n", 118 | "distances_to_landmarks = []\n", 119 | "all_landmarks = []\n", 120 | "asndb = pyasn.pyasn(str(IP_TO_ASN_FILE))\n", 121 | "bgp_prefixes = get_all_bgp_prefixes()\n", 122 | "\n", 123 | "for _, d in data.items():\n", 124 | " same_asn = 0\n", 125 | " diff_asn = 0\n", 126 | " same_bgp = 0\n", 127 | " diff_bgp = 0\n", 128 | " same_24 = 0\n", 129 | " diff_24 = 0\n", 130 | " all_landmarks.append(0)\n", 131 | " if \"tier2:cdn_count\" in d and \"tier2:landmark_count\" in d and \"tier2:failed_header_test_count\" in d:\n", 132 | " all_landmarks[-1] += d['tier2:landmark_count'] + \\\n", 133 | " d['tier2:cdn_count'] + d['tier2:failed_header_test_count']\n", 134 | " valid_landmarks_count += d['tier2:landmark_count']\n", 135 | " unvalid_landmarks_count += d['tier2:cdn_count'] + \\\n", 136 | " d['tier2:failed_header_test_count']\n", 137 | " if \"tier3:cdn_count\" in d and \"tier3:landmark_count\" in d and \"tier3:failed_header_test_count\" in d:\n", 138 | " all_landmarks[-1] += d['tier3:landmark_count'] + \\\n", 139 | " d['tier3:cdn_count'] + d['tier3:failed_header_test_count']\n", 140 | " valid_landmarks_count += d['tier3:landmark_count']\n", 141 | " unvalid_landmarks_count += d['tier3:cdn_count'] + \\\n", 142 | " d['tier3:failed_header_test_count']\n", 143 | " for f in ['tier2:traceroutes', 'tier3:traceroutes']:\n", 144 | " if f in d:\n", 145 | " for t in d[f]:\n", 146 | "\n", 147 | " ipt = t[1]\n", 148 | " ipl = t[2]\n", 149 | " asnt = asndb.lookup(ipt)[0]\n", 150 | " asnl = asndb.lookup(ipl)[0]\n", 151 | " if 
asnl != None and asnt != None:\n", 152 | " if asnt == asnl:\n", 153 | " same_asn += 1\n", 154 | " else:\n", 155 | " diff_asn += 1\n", 156 | "\n", 157 | " nt = ip_network(ipt+\"/24\", strict=False).network_address\n", 158 | " nl = ip_network(ipl+\"/24\", strict=False).network_address\n", 159 | " if nt == nl:\n", 160 | " same_24 += 1\n", 161 | " else:\n", 162 | " diff_24 += 1\n", 163 | "\n", 164 | " if is_same_bgp_prefix(ipt, ipl, bgp_prefixes):\n", 165 | " same_bgp += 1\n", 166 | " else:\n", 167 | " diff_bgp += 1\n", 168 | "\n", 169 | " distances = []\n", 170 | " for f in ['tier2:landmarks', 'tier3:landmarks']:\n", 171 | " target_geo = (d['RIPE:lat'], d['RIPE:lon'])\n", 172 | " if f in d:\n", 173 | " for l in d[f]:\n", 174 | " landmark_geo = (l[2], l[3])\n", 175 | " distances.append(haversine(target_geo, landmark_geo))\n", 176 | " distances_to_landmarks.append(distances)\n", 177 | "\n", 178 | " if same_asn != 0 or diff_asn != 0:\n", 179 | " same_asn_lst.append(same_asn/(same_asn+diff_asn))\n", 180 | "\n", 181 | " if same_24 != 0 or diff_24 != 0:\n", 182 | " same_24_lst.append(same_24/(same_24+diff_24))\n", 183 | " if same_24 != 0:\n", 184 | " print(\n", 185 | " f\"Found {d['target_ip']} with a landmark in the same /24\")\n", 186 | " if same_bgp != 0 or diff_bgp != 0:\n", 187 | " same_bgp_lst.append(same_bgp/(diff_bgp+same_bgp))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 4, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "713 target have potentail landmarks or 0.9861687413554634\n", 200 | "677 target have valid landmarks or 0.9363762102351314\n", 201 | "207 target with a landmark within 1 km or 0.2863070539419087\n", 202 | "419 target with a landmark within 5 km or 0.5795297372060858\n", 203 | "464 target with a landmark within 10 km or 0.6417704011065007\n", 204 | "552 target with a landmark within 40 km or 0.7634854771784232\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "landmarks_all = []\n", 210 | "landmarks_less_1 = []\n", 211 | "landmarks_less_5 = []\n", 212 | "landmarks_less_10 = []\n", 213 | "landmarks_less_40 = []\n", 214 | "\n", 215 | "for landmark_distances in distances_to_landmarks:\n", 216 | " landmarks_all.append(len(landmark_distances))\n", 217 | " landmarks_less_1.append(len([i for i in landmark_distances if i <= 1]))\n", 218 | " landmarks_less_5.append(len([i for i in landmark_distances if i <= 5]))\n", 219 | " landmarks_less_10.append(\n", 220 | " len([i for i in landmark_distances if i <= 10]))\n", 221 | " landmarks_less_40.append(\n", 222 | " len([i for i in landmark_distances if i <= 40]))\n", 223 | "\n", 224 | "lm_a_0 = len([i for i in all_landmarks if i > 0])\n", 225 | "lmv_a_0 = len([i for i in landmarks_all if i > 0])\n", 226 | "lm1_0 = len([i for i in landmarks_less_1 if i > 0])\n", 227 | "lm5_0 = len([i for i in landmarks_less_5 if i > 0])\n", 228 | "lm10_0 = len([i for i in landmarks_less_10 if i > 0])\n", 229 | "lm40_0 = len([i for i in landmarks_less_40 if i > 0])\n", 230 | "\n", 231 | "\n", 232 | "len_all = len(data)\n", 233 | "print(f\"{lm_a_0} target have potentail landmarks or {lm_a_0/len_all}\")\n", 234 | "print(f\"{lmv_a_0} target have valid landmarks or {lmv_a_0/len_all}\")\n", 235 | "print(f\"{lm1_0} target with a landmark within 1 km or {lm1_0/len_all}\")\n", 236 | "print(f\"{lm5_0} target with a landmark within 5 km or {lm5_0/len_all}\")\n", 237 | "print(f\"{lm10_0} target with a landmark within 10 km or {lm10_0/len_all}\")\n", 238 | 
"print(f\"{lm40_0} target with a landmark within 40 km or {lm40_0/len_all}\")" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 5, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stderr", 248 | "output_type": "stream", 249 | "text": [ 250 | "2023-09-14 13:19:51::INFO:root:analysis:: Tier1 Failed\n" 251 | ] 252 | }, 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "207 targets with landmarks (ping <= 1) or 0.2863070539419087\n", 258 | "419 targets with landmarks (ping <= 5) or 0.5795297372060858\n", 259 | "464 targets with landmarks (ping <= 10) or 0.6417704011065007\n", 260 | "552 targets with landmarks (ping <= 40) or 0.7634854771784232\n", 261 | "723 targets with landmarks (ping <= 9999999999) or 1.0\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "clickhouse_driver = Clickhouse()\n", 267 | "query = clickhouse_driver.get_min_rtt_per_src_dst_prefix_query(TARGET_TO_LANDMARKS_PING_TABLE, filter=\"\", threshold=1000000)\n", 268 | "db_table = clickhouse_driver.execute(query)\n", 269 | "\n", 270 | "rtts = []\n", 271 | "remove_dict = {}\n", 272 | "for l in db_table:\n", 273 | " rtts.append(l[2])\n", 274 | " remove_dict[(l[0], l[1])] = l[2]\n", 275 | "\n", 276 | "error1 = []\n", 277 | "error2 = []\n", 278 | "error3 = []\n", 279 | "error4 = []\n", 280 | "error1ms = []\n", 281 | "error2ms = []\n", 282 | "error5ms = []\n", 283 | "error10ms = []\n", 284 | "\n", 285 | "for _, d in data.items():\n", 286 | " errors = every_tier_result_and_errors(d)\n", 287 | " error1.append(errors['error1'])\n", 288 | " error2.append(errors['error2'])\n", 289 | " error3.append(errors['error3'])\n", 290 | " error4.append(errors['error4'])\n", 291 | " err1ms = 50000\n", 292 | " err2ms = 50000\n", 293 | " err5ms = 50000\n", 294 | " err10ms = 50000\n", 295 | " for f in ['tier2:landmarks', 'tier3:landmarks']:\n", 296 | " if f in d:\n", 297 | " for l_ip, _, l_lat, l_lon in d[f]:\n", 298 | " dist = haversine((l_lat, l_lon), (d['RIPE:lat'], d['RIPE:lon']))\n", 299 | " key_rtt = (l_ip, d['target_ip'])\n", 300 | " if dist < err1ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 1):\n", 301 | " err1ms = dist\n", 302 | " if dist < err2ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 2):\n", 303 | " err2ms = dist\n", 304 | " if dist < err5ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 5):\n", 305 | " err5ms = dist\n", 306 | " if dist < err10ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 10):\n", 307 | " err10ms = dist\n", 308 | " if err1ms != 50000:\n", 309 | " error1ms.append(err1ms)\n", 310 | " else:\n", 311 | " error1ms.append(error1[-1])\n", 312 | " if err2ms != 50000:\n", 313 | " error2ms.append(err2ms)\n", 314 | " else:\n", 315 | " error2ms.append(error1[-1])\n", 316 | " if err5ms != 50000:\n", 317 | " error5ms.append(err5ms)\n", 318 | " else:\n", 319 | " error5ms.append(error1[-1])\n", 320 | " if err10ms != 50000:\n", 321 | " error10ms.append(err10ms)\n", 322 | " else:\n", 323 | " error10ms.append(error1[-1])\n", 324 | "\n", 325 | "for i in [1, 5, 10, 40, 9999999999]:\n", 326 | " c = len([j for j in error1ms if j <= i])\n", 327 | " print(f\"{c} targets with landmarks (ping <= {i}) or {c/len(error1ms)}\")" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "review-8XQ99qZ1-py3.10", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | 
"file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.9.13" 348 | }, 349 | "orig_nbformat": 4 350 | }, 351 | "nbformat": 4, 352 | "nbformat_minor": 2 353 | } 354 | -------------------------------------------------------------------------------- /clickhouse_files/init-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | clickhouse client -n <<-EOSQL 5 | CREATE DATABASE IF NOT EXISTS geolocation_replication; 6 | EOSQL 7 | -------------------------------------------------------------------------------- /clickhouse_files/users.d/default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /default.py: -------------------------------------------------------------------------------- 1 | """All the reference paths to storing files settings and constants""" 2 | 3 | from pathlib import Path 4 | 5 | # Default path 6 | DEFAULT_DIR: Path = Path(__file__).resolve().parent 7 | 8 | 9 | ################################################################################################## 10 | # CONSTANTS # 11 | ################################################################################################## 12 | THRESHOLD_DISTANCES = [0, 40, 100, 500, 1000] 13 | SPEED_OF_LIGHT = 300000 14 | SPEED_OF_INTERNET = SPEED_OF_LIGHT * 2 / 3 15 | 16 | 17 | # Atlas path 18 | ATLAS_PATH: Path = DEFAULT_DIR / "datasets/atlas/" 19 | ################################################################################################## 20 | # REPRODUCIBILITY DATASET FILES (static) # 21 | ################################################################################################## 22 | REPRO_PATH: Path = DEFAULT_DIR / "datasets/reproducibility_datasets/" 23 | REPRO_ATLAS_PATH: Path = REPRO_PATH / "atlas/" 24 | REPRO_GENERATED_PATH: Path = REPRO_PATH / "generated/" 25 | 26 | REPRO_ANCHORS_FILE: Path = REPRO_ATLAS_PATH / "reproducibility_anchors.json" 27 | REPRO_PROBES_FILE: Path = REPRO_ATLAS_PATH / "reproducibility_probes.json" 28 | REPRO_PROBES_AND_ANCHORS_FILE: Path = ( 29 | REPRO_ATLAS_PATH / "reproducibility_probes_and_anchors.json" 30 | ) 31 | 32 | REPRO_PAIRWISE_DISTANCE_FILE: Path = ( 33 | REPRO_GENERATED_PATH / "reproducibility_pairwise_distance_ripe_probes.json" 34 | ) 35 | REPRO_REMOVED_PROBES_FILE: Path = ( 36 | REPRO_GENERATED_PATH / "reproducibility_removed_probes.json" 37 | ) 38 | REPRO_FILTERED_PROBES_FILE: Path = ( 39 | REPRO_GENERATED_PATH / "reproducibility_filtered_probes.json" 40 | ) 41 | REPRO_GREEDY_PROBES_FILE: Path = ( 42 | REPRO_GENERATED_PATH / "reproducibility_greedy_probes.json" 43 | ) 44 | REPRO_HITLIST_FILE: Path = REPRO_GENERATED_PATH / "reproducibility_parsed_hitlist.json" 45 | 46 | 47 | ################################################################################################## 48 | # USER DATASET FILES (generated) # 49 | ################################################################################################## 50 | USER_PATH: Path = DEFAULT_DIR / "datasets/user_datasets/" 51 | USER_ATLAS_PATH: Path = USER_PATH / "atlas/" 52 | USER_GENERATED_PATH: Path = USER_PATH / "generated/" 53 | 54 | USER_ANCHORS_FILE: Path = USER_ATLAS_PATH / "user_anchors.json" 55 | USER_PROBES_FILE: Path = USER_ATLAS_PATH / "user_probes.json" 56 | 
USER_PROBES_AND_ANCHORS_FILE: Path = USER_ATLAS_PATH / "user_probes_and_anchors.json" 57 | 58 | USER_PAIRWISE_DISTANCE_FILE: Path = ( 59 | USER_GENERATED_PATH / "user_pairwise_distance_ripe_probes.json" 60 | ) 61 | USER_REMOVED_PROBES_FILE: Path = USER_GENERATED_PATH / "user_removed_probes.json" 62 | USER_FILTERED_PROBES_FILE: Path = USER_GENERATED_PATH / "user_filtered_probes.json" 63 | USER_GREEDY_PROBES_FILE: Path = USER_GENERATED_PATH / "user_greedy_probes.json" 64 | USER_HITLIST_FILE: Path = USER_GENERATED_PATH / "user_parsed_hitlist.json" 65 | 66 | ################################################################################################## 67 | # CLICKHOUSE SETTINGS # 68 | ################################################################################################## 69 | CLICKHOUSE_CLIENT = DEFAULT_DIR / "clickhouse_files/clickhouse" 70 | CLICKHOUSE_HOST = "localhost" 71 | CLICKHOUSE_DB = "geolocation_replication" 72 | CLICKHOUSE_USER = "default" 73 | CLICKHOUSE_PASSWORD = "" 74 | 75 | # tables to store reproduction results 76 | ANCHORS_MESHED_PING_TABLE = "anchors_meshed_pings" 77 | ANCHORS_TO_PREFIX_TABLE = "anchors_to_prefix_pings" 78 | PROBES_TO_PREFIX_TABLE = "probes_to_prefix_pings" 79 | TARGET_TO_LANDMARKS_PING_TABLE = "targets_to_landmarks_pings" 80 | PROBES_TO_ANCHORS_PING_TABLE = "ping_10k_to_anchors" 81 | ANCHORS_MESHED_TRACEROUTE_TABLE = "anchors_meshed_traceroutes" 82 | STREET_LEVEL_TRACEROUTES_TABLE = "street_lvl_traceroutes" 83 | 84 | # tables to store user measurements 85 | USER_VPS_TO_PREFIX_TABLE = "user_vps_to_prefix" 86 | USER_VPS_TO_TARGET_TABLE = "user_vps_to_target" 87 | 88 | USER_TARGET_TO_LANDMARKS_PING_TABLE = "user_targets_to_landmarks_pings" 89 | USER_ANCHORS_MESHED_TRACEROUTE_TABLE = "user_anchors_meshed_traceroutes" 90 | USER_STREET_LEVEL_TRACEROUTES_TABLE = "user_street_lvl_traceroutes" 91 | 92 | # reproduction results files 93 | CLICKHOUSE_STATIC_DATASET: Path = DEFAULT_DIR / "datasets/clickhouse_data" 94 | 95 | ANCHORS_MESHED_PING_FILE = ( 96 | CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_MESHED_PING_TABLE}.zst" 97 | ) 98 | ANCHORS_TO_PREFIX_FILE = CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_TO_PREFIX_TABLE}.zst" 99 | PROBES_TO_PREFIX_FILE = CLICKHOUSE_STATIC_DATASET / f"{PROBES_TO_PREFIX_TABLE}.zst" 100 | TARGET_TO_LANDMARKS_PING_FILE = ( 101 | CLICKHOUSE_STATIC_DATASET / f"{TARGET_TO_LANDMARKS_PING_TABLE}.zst" 102 | ) 103 | PROBES_TO_ANCHORS_PING_FILE = ( 104 | CLICKHOUSE_STATIC_DATASET / f"{PROBES_TO_ANCHORS_PING_TABLE}.zst" 105 | ) 106 | ANCHORS_MESHED_TRACEROUTE_FILE = ( 107 | CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_MESHED_TRACEROUTE_TABLE}.zst" 108 | ) 109 | STREET_LEVEL_TRACEROUTES_FILE = ( 110 | CLICKHOUSE_STATIC_DATASET / f"{STREET_LEVEL_TRACEROUTES_TABLE}.zst" 111 | ) 112 | 113 | 114 | ################################################################################################## 115 | # RIPE ATLAS VPS BIAS ANALYSIS # 116 | ################################################################################################## 117 | ASNS_TYPES: Path = DEFAULT_DIR / "datasets/asns_types" 118 | ASNS_TYPE_CAIDA: Path = ASNS_TYPES / "caida_enhanced_as_type.json" 119 | ASNS_TYPE_STANFORD: Path = ASNS_TYPES / "AS_categories_stanford.json" 120 | 121 | 122 | ################################################################################################## 123 | # STATIC FILES # 124 | ################################################################################################## 125 | STATIC_PATH: Path = DEFAULT_DIR / "datasets/static_datasets/" 126 | 
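# ------------------------------------------------------------------------------------------------
# Illustrative sketch (assumption only, not referenced by the repository's scripts): how the
# constants above fit together. SPEED_OF_INTERNET (km/s, ~2/3 of the speed of light, defined at
# the top of this module) gives the usual CBG-style conversion from a minimum RTT to an upper
# bound on the vantage-point-to-target distance, and the CLICKHOUSE_* settings plus the ping
# table names are what the analysis code uses to read stored measurements. The helper name
# rtt_to_max_distance_km and the example query below are hypothetical.
def rtt_to_max_distance_km(min_rtt_ms: float) -> float:
    """Upper bound on distance: one-way delay (in seconds) times propagation speed (km/s)."""
    return (min_rtt_ms / 2) / 1000 * SPEED_OF_INTERNET


# Hypothetical usage against the local ClickHouse instance started by install.sh:
#   from clickhouse_driver import Client
#   client = Client(host=CLICKHOUSE_HOST, user=CLICKHOUSE_USER,
#                   password=CLICKHOUSE_PASSWORD, database=CLICKHOUSE_DB)
#   (count,), = client.execute(f"SELECT count() FROM {PROBES_TO_ANCHORS_PING_TABLE}")
#   rtt_to_max_distance_km(10)  # a 10 ms RTT caps the distance at ~1000 km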
127 | COUNTRIES_JSON_FILE: Path = STATIC_PATH / "countries.json" 128 | COUNTRIES_TXT_FILE: Path = STATIC_PATH / "countries.txt" 129 | COUNTRIES_CSV_FILE: Path = STATIC_PATH / "iso_code_2.csv" 130 | POPULATION_CITY_FILE: Path = STATIC_PATH / "population.json" 131 | CITIES_500_FILE: Path = STATIC_PATH / "cities500.txt" 132 | POPULATION_DENSITY_FILE: Path = ( 133 | STATIC_PATH / "gpw_v4_population_density_rev11_2020_30_sec.tif" 134 | ) 135 | 136 | ADDRESS_FILE: Path = ( 137 | STATIC_PATH / "internet_address_verfploeter_hitlist_it102w-20230125.fsdb" 138 | ) 139 | GEOLITE_FILE: Path = STATIC_PATH / "GeoLite2-City-Blocks-IPv4_20230516.tree" 140 | IP_INFO_GEO_FILE: Path = STATIC_PATH / "ip_info_geo_anchors.json" 141 | MAXMIND_GEO_FILE: Path = STATIC_PATH / "maxmind_free_geo_anchors.json" 142 | 143 | GEOPAPIFY_1_FILE: Path = STATIC_PATH / "geocoded_by_geoapify-10_05_2023_0_500.csv" 144 | GEOPAPIFY_2_FILE: Path = STATIC_PATH / "geocoded_by_geoapify-10_05_2023_500_last.csv" 145 | 146 | IP_TO_ASN_FILE: Path = STATIC_PATH / "2022-03-28.dat" 147 | ANCHORS_SECOND_PAPER_FILE: Path = STATIC_PATH / "anchors_ip_list.json" 148 | CACHED_WEBSITES_FILE: Path = STATIC_PATH / "websites.json" 149 | BGP_PRIFIXES_FILE: Path = STATIC_PATH / "bgp_prefixes.json" 150 | 151 | ################################################################################################## 152 | # ANALYSIS RESULTS FILES # 153 | ################################################################################################## 154 | 155 | # REPRODUCIBILITY 156 | REPRO_ANALYSIS_PATH: Path = DEFAULT_DIR / "analysis/results/reproducibility/" 157 | 158 | REPRO_PROBES_TO_ANCHORS_RESULT_FILE: Path = ( 159 | REPRO_ANALYSIS_PATH / "cbg_thresholds_probes_to_anchors.json" 160 | ) 161 | REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE: Path = ( 162 | REPRO_ANALYSIS_PATH / "vp_selection_algorithm_probes_1.json" 163 | ) 164 | REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE: Path = ( 165 | REPRO_ANALYSIS_PATH / "vp_selection_algorithm_probes_3.json" 166 | ) 167 | REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE: Path = ( 168 | REPRO_ANALYSIS_PATH / "vp_selection_algoxrithm_probes_10.json" 169 | ) 170 | REPRO_ACCURACY_VS_N_VPS_PROBES_FILE: Path = ( 171 | REPRO_ANALYSIS_PATH / "accuracy_vs_n_vps_probes.json" 172 | ) 173 | REPRO_ROUND_BASED_ALGORITHM_FILE: Path = ( 174 | REPRO_ANALYSIS_PATH / "round_based_algorithm_error_cdf.json" 175 | ) 176 | 177 | # FROM USER MEASUREMENTS 178 | USER_ANALYSIS_PATH: Path = DEFAULT_DIR / "analysis/results/user/" 179 | 180 | USER_PROBES_TO_ANCHORS_RESULT_FILE: Path = ( 181 | USER_ANALYSIS_PATH / "cbg_thresholds_probes_to_anchors.json" 182 | ) 183 | USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE: Path = ( 184 | USER_ANALYSIS_PATH / "vp_selection_algorithm_probes_1.json" 185 | ) 186 | USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE: Path = ( 187 | USER_ANALYSIS_PATH / "vp_selection_algorithm_probes_3.json" 188 | ) 189 | USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE: Path = ( 190 | USER_ANALYSIS_PATH / "vp_selection_algoxrithm_probes_10.json" 191 | ) 192 | USER_ACCURACY_VS_N_VPS_PROBES_FILE: Path = ( 193 | USER_ANALYSIS_PATH / "accuracy_vs_n_vps_probes.json" 194 | ) 195 | USER_ROUND_BASED_ALGORITHM_FILE: Path = ( 196 | USER_ANALYSIS_PATH / "round_based_algorithm_error_cdf.json" 197 | ) 198 | 199 | ################################################################################################## 200 | # MEASUREMENTS RESULTS FILES # 201 | ################################################################################################## 202 | 
MEASUREMENTS_MILLION_SCALE_PATH: Path = ( 203 | DEFAULT_DIR / "measurements/results/million_scale/" 204 | ) 205 | MEASUREMENTS_STREET_LEVEL_PATH: Path = ( 206 | DEFAULT_DIR / "measurements/results/street_level/" 207 | ) 208 | MEASUREMENT_CONFIG_PATH: Path = ( 209 | DEFAULT_DIR / "measurements/results/million_scale/measurement_config/" 210 | ) 211 | 212 | ############## MILLION SCALE FILES 213 | PREFIX_MEASUREMENT_RESULTS: Path = ( 214 | MEASUREMENTS_MILLION_SCALE_PATH / "prefix_measurement_results.json" 215 | ) 216 | TARGET_MEASUREMENT_RESULTS: Path = ( 217 | MEASUREMENTS_MILLION_SCALE_PATH / "target_measurement_results.json" 218 | ) 219 | 220 | ############## STREET LEVEL FILES 221 | ANALYZABLE_FILE: Path = MEASUREMENTS_STREET_LEVEL_PATH / "all_res.json" 222 | 223 | 224 | ################################################################################################## 225 | # FIGURES FILES # 226 | ################################################################################################## 227 | 228 | # REPRODUCIBILITY 229 | REPRO_FIGURE_PATH: Path = DEFAULT_DIR / "analysis/figures/reproducibility" 230 | 231 | REPRO_GEO_DATABASE_FILE: Path = REPRO_FIGURE_PATH / "geo_databases.pdf" 232 | REPRO_ACCURACY_VS_NB_VPS_FILE: Path = REPRO_FIGURE_PATH / "accuracy_vs_n_vps_probes.pdf" 233 | REPRO_ACCURACY_VS_SUBSET_SIZES_FILE: Path = ( 234 | REPRO_FIGURE_PATH / "accuracy_vs_subset_sizes.pdf" 235 | ) 236 | REPRO_CBG_THRESHOLD_PROBES_FILE: Path = REPRO_FIGURE_PATH / "cbg_thresholds_probes.pdf" 237 | REPRO_CBG_THRESHOLD_VP_SELECTION_FILE: Path = ( 238 | REPRO_FIGURE_PATH / "cbg_thresholds_vp_selection.pdf" 239 | ) 240 | REPRO_CBG_THRESHOLD_CONTINENT_FILE: Path = ( 241 | REPRO_FIGURE_PATH / "cbg_thresholds_continent.pdf" 242 | ) 243 | REPRO_ROUND_ALGORITHM_ERROR_FILE: Path = REPRO_FIGURE_PATH / "round_algorithm_error.pdf" 244 | REPRO_CLOSE_LANDMARK_FILE: Path = REPRO_FIGURE_PATH / "cdf_close_landmark_check_log.pdf" 245 | REPRO_INVALID_RTT_FILE: Path = REPRO_FIGURE_PATH / "invalid_rtt.pdf" 246 | REPRO_TIME_TO_GEOLOCATE_FILE: Path = REPRO_FIGURE_PATH / "cdf_time_to_geolocate.pdf" 247 | REPRO_SCATTER_DISTANCE_FILE: Path = REPRO_FIGURE_PATH / "scatter_md_vs_d.pdf" 248 | REPRO_SCATTER_DENSITY_FILE: Path = REPRO_FIGURE_PATH / "scatter_density.pdf" 249 | REPRO_CDF_DENSITY_FILE: Path = REPRO_FIGURE_PATH / "cdf_density.pdf" 250 | 251 | # FROM USER MEASUREMENTS 252 | USER_FIGURE_PATH: Path = DEFAULT_DIR / "analysis/figures/user" 253 | 254 | REPRO_GEO_DATABASE_FILE: Path = USER_FIGURE_PATH / "geo_databases.pdf" 255 | USER_ACCURACY_VS_NB_VPS_FILE: Path = USER_FIGURE_PATH / "accuracy_vs_n_vps_probes.pdf" 256 | USER_ACCURACY_VS_SUBSET_SIZES_FILE: Path = ( 257 | USER_FIGURE_PATH / "accuracy_vs_subset_sizes.pdf" 258 | ) 259 | USER_CBG_THRESHOLD_PROBES_FILE: Path = USER_FIGURE_PATH / "cbg_thresholds_probes.pdf" 260 | USER_CBG_THRESHOLD_VP_SELECTION_FILE: Path = ( 261 | USER_FIGURE_PATH / "cbg_thresholds_vp_selection.pdf" 262 | ) 263 | USER_CBG_THRESHOLD_CONTINENT_FILE: Path = ( 264 | USER_FIGURE_PATH / "cbg_thresholds_continent.pdf" 265 | ) 266 | USER_ROUND_ALGORITHM_ERROR_FILE: Path = USER_FIGURE_PATH / "round_algorithm_error.pdf" 267 | USER_CLOSE_LANDMARK_FILE: Path = USER_FIGURE_PATH / "cdf_close_landmark_check_log.pdf" 268 | USER_INVALID_RTT_FILE: Path = USER_FIGURE_PATH / "invalid_rtt.pdf" 269 | USER_TIME_TO_GEOLOCATE_FILE: Path = USER_FIGURE_PATH / "cdf_time_to_geolocate.pdf" 270 | USER_SCATTER_DISTANCE_FILE: Path = USER_FIGURE_PATH / "scatter_md_vs_d.pdf" 271 | USER_SCATTER_DENSITY_FILE: Path = 
USER_FIGURE_PATH / "scatter_density.pdf" 272 | USER_CDF_DENSITY_FILE: Path = USER_FIGURE_PATH / "cdf_density.pdf" 273 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | # pull the docker image 2 | docker pull clickhouse/clickhouse-server:22.6 3 | 4 | 5 | # start the server using docker 6 | docker run --rm -d \ 7 | -v ./clickhouse_files/data:/var/lib/clickhouse/ \ 8 | -v ./clickhouse_files/logs:/var/log/clickhouse-server/ \ 9 | -v ./clickhouse_files/users.d:/etc/clickhouse-server/users.d:ro \ 10 | -v ./clickhouse_files/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \ 11 | -p 8123:8123 \ 12 | -p 9000:9000 \ 13 | --ulimit nofile=262144:262144 \ 14 | clickhouse/clickhouse-server:22.6 15 | 16 | # download clickhouse client binary 17 | curl https://clickhouse.com/ | sh 18 | mv clickhouse ./clickhouse_files/ 19 | 20 | # install source files 21 | poetry lock 22 | poetry install 23 | 24 | # run clickhouse db installer for table init 25 | poetry run python scripts/utils/clickhouse_installer.py 26 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s::%(levelname)s:%(name)s:%(module)s:: %(message)s", 5 | level=logging.INFO, 6 | datefmt="%Y-%m-%d %H:%M:%S", 7 | ) 8 | 9 | logger = logging.getLogger() 10 | -------------------------------------------------------------------------------- /measurements/landmark_traceroutes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Probing part 4\n", 8 | "\n", 9 | "Vantage points will probe the targets in a 3-step method, either by doing pings or traceroutes. \n", 10 | "\n", 11 | "Vantage points are the Ripe Atlas anchors, then indireclty some online landmarks. \n", 12 | "As always, targets are the anchors. \n", 13 | "\n", 14 | "This notebook is an implementation of the street level method. Check the paper for more information. 
\n", 15 | "To do after create_datasets.ipynb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import traceback\n", 25 | "\n", 26 | "from pprint import pprint\n", 27 | "from clickhouse_driver import Client\n", 28 | "\n", 29 | "from scripts.utils.file_utils import load_json, dump_json\n", 30 | "from scripts.utils.measurement_utils import load_vps\n", 31 | "from scripts.utils.helpers import haversine\n", 32 | "from scripts.street_level.traceroutes_results import serialize\n", 33 | "from scripts.street_level.three_tiers import get_all_info_geoloc\n", 34 | "from default import USER_ANCHORS_FILE, ANALYZABLE_FILE\n", 35 | "\n", 36 | "NB_VP = 10" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### database for traceroutes" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Main\n", 51 | "\n", 52 | "This would take a lot of time (more than a day if you use all the VPs)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Anchors are the targets and Vantage points\n", 62 | "anchors = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VP)\n", 63 | "try:\n", 64 | " all_res = load_json(ANALYZABLE_FILE)\n", 65 | "except FileNotFoundError:\n", 66 | " all_res = {}\n", 67 | "\n", 68 | "i = 0\n", 69 | "for target in anchors.values():\n", 70 | " try:\n", 71 | " target_ip = target['address_v4']\n", 72 | " if target_ip in all_res: # we skip targets already geolocated\n", 73 | " continue\n", 74 | " print(f\"{i}:{target_ip}\")\n", 75 | " i += 1\n", 76 | "\n", 77 | " res = get_all_info_geoloc(target_ip, vps=anchors.values())\n", 78 | " res = serialize(res)\n", 79 | " # We save the coordinates of the targets as given by RIPE Atlas\n", 80 | " res['RIPE:lat'] = target['geometry']['coordinates'][1]\n", 81 | " res['RIPE:lon'] = target['geometry']['coordinates'][0]\n", 82 | "\n", 83 | " # We save the error of the estimated geolocation at each step\n", 84 | " if res['lat'] != None and res['lon'] != None:\n", 85 | " res['error'] = haversine(\n", 86 | " (res['lat'], res['lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 87 | " if 'tier1:lat' in res and 'tier1:lon' in res and res['tier1:lat'] != None and res['tier1:lon'] != None:\n", 88 | " res['tier1:error'] = haversine(\n", 89 | " (res['tier1:lat'], res['tier1:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 90 | " if 'tier2:lat' in res and 'tier2:lon' in res and res['tier2:lat'] != None and res['tier2:lon'] != None:\n", 91 | " res['tier2:error'] = haversine(\n", 92 | " (res['tier2:lat'], res['tier2:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 93 | " if 'tier3:lat' in res and 'tier3:lon' in res and res['tier3:lat'] != None and res['tier3:lon'] != None:\n", 94 | " res['tier3:error'] = haversine(\n", 95 | " (res['tier3:lat'], res['tier3:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 96 | "\n", 97 | " all_res[target_ip] = res\n", 98 | " # We save the results\n", 99 | " dump_json(all_res, ANALYZABLE_FILE)\n", 100 | " except Exception:\n", 101 | " traceback.print_exc()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Geolocat one IP" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "{'target_ip': 
'195.83.132.129', 'tier1:done': False, 'tier2:done': False, 'tier3:done': False, 'negative_rtt_included': True, 'speed_threshold': 0.6666666666666666, 'tier1:lat': None, 'tier1:lon': None, 'vps': set(), 'tier1:duration': 1282.0457310676575, 'lat': None, 'lon': None}\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "target_ip = '195.83.132.129' # LAAS/CNRS\n", 126 | "geolocation = get_all_info_geoloc(target_ip)\n", 127 | "#geolocation = geoloc(target_ip)\n", 128 | "print(geolocation)\n", 129 | "geolocation = serialize(geolocation)\n", 130 | "dump_json(geolocation, 'res_tmp.json')" 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "review-8XQ99qZ1-py3.10", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.9.13" 151 | }, 152 | "orig_nbformat": 4 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 2 156 | } 157 | -------------------------------------------------------------------------------- /measurements/million_scale_measurements.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Probing part 1\n", 8 | "\n", 9 | "Vantage points will probe either the targets themselves (step 2) or other addresses in the same /24 prefix (step 1).\n", 10 | "\n", 11 | "Vantage points are only the anchors. \n", 12 | "As always, targets are the anchors. \n", 13 | "\n", 14 | "This notebook is an implementation of the million scale method. Check the paper for more information. 
\n", 15 | "To do after create_datasets.ipynb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import uuid\n", 25 | "\n", 26 | "from logger import logger\n", 27 | "from scripts.utils.file_utils import load_json\n", 28 | "from scripts.utils.measurement_utils import (\n", 29 | " load_targets,\n", 30 | " load_vps,\n", 31 | " get_measurement_config,\n", 32 | " save_measurement_config,\n", 33 | " get_target_prefixes,\n", 34 | " ping_prefixes,\n", 35 | " ping_targets,\n", 36 | ")\n", 37 | "from default import (\n", 38 | " USER_ANCHORS_FILE,\n", 39 | " USER_HITLIST_FILE,\n", 40 | " MEASUREMENT_CONFIG_PATH,\n", 41 | ")\n", 42 | "\n", 43 | "# will define the number of vps and targets to use\n", 44 | "NB_TARGETS = 2\n", 45 | "NB_VPS = 4" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Load targets and vps dataset" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "targets = load_targets(USER_ANCHORS_FILE, nb_target=NB_TARGETS)\n", 62 | "vps = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VPS)\n", 63 | "\n", 64 | "# every anchors /24 subnet\n", 65 | "target_addrs = [t[\"address_v4\"] for t in targets]\n", 66 | "target_prefixes = get_target_prefixes(target_addrs)\n", 67 | "\n", 68 | "# responsive IP addresses in all /24 prefixes\n", 69 | "targets_per_prefix = load_json(USER_HITLIST_FILE)\n", 70 | "\n", 71 | "logger.info(f\"nb targets: {len(targets)}\")\n", 72 | "logger.info(f\"nb_vps : {len(vps)}\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Generate measurement config\n", 80 | "\n", 81 | "This configuration is used to retrieve all measurements results from RIPE Atlas using their API." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "2024-10-04 13:13:06::INFO:root:3171606573:: Starting experiment with uuid : c78efe35-8089-41a9-9206-ac7bac4a8a68\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# measurement configuration for retrieval\n", 99 | "experiment_uuid = str(uuid.uuid4())\n", 100 | "target_measurement_uuid = str(uuid.uuid4())\n", 101 | "prefix_measurement_uuid = str(uuid.uuid4())\n", 102 | "\n", 103 | "config_file_path = MEASUREMENT_CONFIG_PATH / f\"{experiment_uuid}.json\"\n", 104 | "\n", 105 | "logger.info(f\"Starting experiment with uuid : {experiment_uuid}\")\n", 106 | "\n", 107 | "measurement_config = get_measurement_config(\n", 108 | " experiment_uuid=experiment_uuid,\n", 109 | " target_measurement_uuid=target_measurement_uuid,\n", 110 | " prefix_measurement_uuid=prefix_measurement_uuid,\n", 111 | " targets=targets,\n", 112 | " target_prefixes=target_prefixes,\n", 113 | " vps=vps,\n", 114 | ")\n", 115 | "\n", 116 | "save_measurement_config(measurement_config, config_file_path)" 117 | ] 118 | }, 119 | { 120 | "attachments": {}, 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "# Step 1: probing each target prefixes" 125 | ] 126 | }, 127 | { 128 | "attachments": {}, 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Probe target prefixes\n", 133 | "WARNING : Time consumming section" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stderr", 143 | "output_type": "stream", 144 | "text": [ 145 | "2024-10-04 13:13:06::INFO:root:measurement_utils:: No cached results available\n", 146 | "2024-10-04 13:13:06::INFO:root:measurement_utils:: Starting measurements dd2e9428-762d-4353-99ca-613057d430a3 with parameters: dry_run=False; nb_targets=2; nb_vps=4.\n", 147 | "2024-10-04 13:13:06::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942232\n", 148 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942233\n", 149 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942234\n", 150 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942235\n", 151 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942236\n", 152 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942237\n", 153 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement : dd2e9428-762d-4353-99ca-613057d430a3 done\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "ping_prefixes(\n", 159 | " measurement_uuid=prefix_measurement_uuid,\n", 160 | " measurement_config=measurement_config,\n", 161 | " target_prefixes=target_prefixes,\n", 162 | " targets_per_prefix=targets_per_prefix,\n", 163 | " vps=vps,\n", 164 | ")\n", 165 | "\n", 166 | "save_measurement_config(measurement_config, config_file_path)" 167 | ] 168 | }, 169 | { 170 | "attachments": {}, 171 | 
"cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Step 2: probing each target" 175 | ] 176 | }, 177 | { 178 | "attachments": {}, 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Probe targets\n", 183 | "WARNING : Time consumming section" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stderr", 193 | "output_type": "stream", 194 | "text": [ 195 | "2024-10-04 13:13:11::INFO:root:measurement_utils:: Starting measurements 6796bfe3-7137-43f1-9f9f-71e0a141157d with parameters: dry_run=False; nb_targets=6; nb_vps=4.\n", 196 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942244\n", 197 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942245\n", 198 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942246\n", 199 | "2024-10-04 13:13:13::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942247\n", 200 | "2024-10-04 13:13:13::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942248\n", 201 | "2024-10-04 13:13:14::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942249\n", 202 | "2024-10-04 13:13:14::INFO:root:ping_and_traceroute_classes:: measurement : 6796bfe3-7137-43f1-9f9f-71e0a141157d done\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "# measurement configuration for retrieval\n", 208 | "ping_targets(\n", 209 | " measurement_uuid=target_measurement_uuid,\n", 210 | " measurement_config=measurement_config,\n", 211 | " targets=targets,\n", 212 | " vps=vps,\n", 213 | " use_cache=False,\n", 214 | ")\n", 215 | "save_measurement_config(measurement_config, config_file_path)" 216 | ] 217 | }, 218 | { 219 | "attachments": {}, 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Retrieve prefix results\n", 224 | "WARNING : Time consuming section\n", 225 | "\n", 226 | "Note: it might take some time before measurement results are available through RIPE API. If no results are available, retry after a few minutes (or hours, it might really depends on the probe itself)." 
227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 7, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from logger import logger\n", 236 | "from scripts.utils.file_utils import load_json\n", 237 | "from scripts.utils.measurement_utils import (\n", 238 | " retrieve_results,\n", 239 | " insert_prefix_results,\n", 240 | " insert_target_results,\n", 241 | ")\n", 242 | "from default import (\n", 243 | " PREFIX_MEASUREMENT_RESULTS,\n", 244 | " TARGET_MEASUREMENT_RESULTS,\n", 245 | ")\n", 246 | "\n", 247 | "# will define the number of vps and targets to use\n", 248 | "NB_TARGETS = 2\n", 249 | "NB_VPS = 4" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 8, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stderr", 259 | "output_type": "stream", 260 | "text": [ 261 | "2024-10-04 13:13:14::INFO:root:3539837011:: {'experiment_uuid': 'c78efe35-8089-41a9-9206-ac7bac4a8a68', 'status': 'ongoing', 'start_time': '2024-10-04 13:13:06.112516', 'end_time': None, 'is_dry_run': False, 'nb_targets': 2, 'nb_vps': 4, 'description': 'measurements from a set of vps towards all targets/target prefixes', 'af': 4, 'target_measurements': {'measurement_uuid': '6796bfe3-7137-43f1-9f9f-71e0a141157d', 'targets': ['103.196.37.98', '195.246.236.1', '77.220.233.1', '185.230.79.16', '185.34.2.114', '217.25.179.62'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047594.2916105, 'start_time': 1728047591.8001034}, 'prefix_measurements': {'measurement_uuid': 'dd2e9428-762d-4353-99ca-613057d430a3', 'targets': ['103.196.37.0', '195.246.236.0'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047589.574289, 'start_time': 1728047586.1349247}, 'meshed_measurements': {'measurement_uuid': '805d6778-9e09-4be7-9c43-d4aafc813a10', 'targets': ['103.196.37.98', '195.246.236.1', '77.220.233.1', '185.230.79.16', '185.34.2.114', '217.25.179.62'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 
6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047591.7847333, 'start_time': 1728047589.5959833}}\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "measurement_config = load_json(config_file_path)\n", 267 | "logger.info(measurement_config)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 9, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "2024-10-04 13:13:14::INFO:root:1680719454:: retrieving results for measurement ids: dd2e9428-762d-4353-99ca-613057d430a3\n", 280 | "2024-10-04 13:13:15::INFO:root:measurement_utils:: nb measurements retrieved: 0 for measurement_uuid : dd2e9428-762d-4353-99ca-613057d430a3\n" 281 | ] 282 | }, 283 | { 284 | "ename": "UnboundLocalError", 285 | "evalue": "local variable 'result' referenced before assignment", 286 | "output_type": "error", 287 | "traceback": [ 288 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 289 | "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", 290 | "Cell \u001b[0;32mIn[9], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# sometimes, not all probes give output, reduce timeout if you do not want to wait for too long\u001b[39;00m\n\u001b[1;32m 6\u001b[0m response \u001b[38;5;241m=\u001b[39m retrieve_results(prefix_measurement_uuid, PREFIX_MEASUREMENT_RESULTS)\n\u001b[0;32m----> 8\u001b[0m \u001b[43minsert_prefix_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n", 291 | "File \u001b[0;32m/storage/hugo/geoloc-imc-2023/scripts/utils/measurement_utils.py:324\u001b[0m, in \u001b[0;36minsert_prefix_results\u001b[0;34m(results)\u001b[0m\n\u001b[1;32m 319\u001b[0m values_description \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msrc, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m )\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m results:\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mno data to insert, data = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mresult\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[1;32m 327\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 328\u001b[0m \u001b[38;5;66;03m# parse response\u001b[39;00m\n", 292 | "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'result' referenced before assignment" 293 | ] 294 | } 295 | ], 296 | "source": [ 297 | "prefix_measurement_uuid = measurement_config[\"prefix_measurements\"][\"measurement_uuid\"]\n", 298 | "\n", 299 | "logger.info(f\"retrieving results for measurement ids: {prefix_measurement_uuid}\")\n", 300 | "\n", 301 | "# sometimes, not all probes give output, reduce timeout if you do not want to wait for too long\n", 302 | "response = retrieve_results(prefix_measurement_uuid, 
PREFIX_MEASUREMENT_RESULTS)\n", 303 | "\n", 304 | "insert_prefix_results(response)" 305 | ] 306 | }, 307 | { 308 | "attachments": {}, 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Retrieve target results\n", 313 | "WARNING : Time consuming section\n", 314 | "\n", 315 | "Note: it might take some time before measurement results are available through the RIPE API. If no results are available, retry after a few minutes (or hours, it really depends on the probe itself)." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stderr", 325 | "output_type": "stream", 326 | "text": [ 327 | "2024-10-03 18:08:53::INFO:root:3802694766:: retrieving results for measurement ids: 18020ef4-fcc5-410b-9eb1-9ab3a18dd3a3\n", 328 | "2024-10-03 18:08:53::INFO:root:measurement_utils:: nb measurements retrieved: 20 for measurement_uuid : 18020ef4-fcc5-410b-9eb1-9ab3a18dd3a3\n", 329 | "2024-10-03 18:08:53::INFO:root:measurement_utils:: Target measurements successfully inserted in table : user_vps_to_target\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "target_measurement_uuid = measurement_config[\"target_measurements\"][\"measurement_uuid\"]\n", 335 | "\n", 336 | "logger.info(f\"retrieving results for measurement ids: {target_measurement_uuid}\")\n", 337 | "\n", 338 | "response = retrieve_results(target_measurement_uuid, TARGET_MEASUREMENT_RESULTS)\n", 339 | "\n", 340 | "insert_target_results(response)" 341 | ] 342 | } 343 | ], 344 | "metadata": { 345 | "kernelspec": { 346 | "display_name": "geoloc-imc-2023-GZT64Hva-py3.10", 347 | "language": "python", 348 | "name": "python3" 349 | }, 350 | "language_info": { 351 | "codemirror_mode": { 352 | "name": "ipython", 353 | "version": 3 354 | }, 355 | "file_extension": ".py", 356 | "mimetype": "text/x-python", 357 | "name": "python", 358 | "nbconvert_exporter": "python", 359 | "pygments_lexer": "ipython3", 360 | "version": "3.10.12" 361 | }, 362 | "orig_nbformat": 4 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 2 366 | } 367 | -------------------------------------------------------------------------------- /measurements/million_scale_measurements.py: -------------------------------------------------------------------------------- 1 | """perform a meshed ping measurement where each VP is probed by every other VP""" 2 | 3 | from logger import logger 4 | 5 | from scripts.utils.file_utils import load_json 6 | from scripts.utils.measurement_utils import ( 7 | load_targets, 8 | load_vps, 9 | get_measurement_config, 10 | save_measurement_config, 11 | get_target_prefixes, 12 | ping_prefixes, 13 | ping_targets, 14 | retrieve_results, 15 | insert_prefix_results, 16 | insert_target_results, 17 | ) 18 | from default import ( 19 | USER_ANCHORS_FILE, 20 | USER_HITLIST_FILE, 21 | PREFIX_MEASUREMENT_RESULTS, 22 | TARGET_MEASUREMENT_RESULTS, 23 | MEASUREMENT_CONFIG_PATH, 24 | ) 25 | 26 | # Small number of targets and VPs for testing 27 | # Change to the real anchors and VPs values for a complete measurement 28 | NB_TARGETS = 5 29 | NB_VPS = 10 30 | 31 | # measurement configuration for retrieval, 32 | # replace if you want to create a new batch of measurements 33 | EXPERIMENT_UUID = "3992e46c-73cf-4a7b-9428-3198856039a9" 34 | TARGET_MEASUREMENT_UUID = "03eb9559-88fe-41cb-b62c-4c07d1d5acb8" 35 | PREFIX_MEASUREMENT_UUID = "a09709aa-be76-4687-852e-64e8090bee70" 36 | CONFIG_PATH = MEASUREMENT_CONFIG_PATH / f"{EXPERIMENT_UUID}.json" 37 | 38 | 39 | def 
main_measurements() -> None: 40 | """perform all measurements related to million scale""" 41 | # set any of these variables to execute the corresponding function 42 | do_target_pings = True 43 | do_target_prefix_pings = True 44 | 45 | # load targets and VPs 46 | targets = load_targets(USER_ANCHORS_FILE, nb_target=NB_TARGETS) 47 | vps = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VPS) 48 | 49 | # every anchor's /24 subnet 50 | target_addrs = [t["address_v4"] for t in targets] 51 | target_prefixes = get_target_prefixes(target_addrs) 52 | # responsive IP addresses in all /24 prefixes 53 | targets_per_prefix = load_json(USER_HITLIST_FILE) 54 | 55 | logger.info(f"Starting experiment with uuid :: {EXPERIMENT_UUID}") 56 | logger.info(f"Config output :: {CONFIG_PATH}") 57 | 58 | # check if measurements under this config uuid already exist 59 | if CONFIG_PATH.exists(): 60 | logger.info(f"Loading existing measurement config:: {EXPERIMENT_UUID}") 61 | measurement_config = load_json(CONFIG_PATH) 62 | else: 63 | # create a new config if there is no existing one 64 | measurement_config = get_measurement_config( 65 | targets=targets, 66 | vps=vps, 67 | target_prefixes=target_prefixes, 68 | experiment_uuid=EXPERIMENT_UUID, 69 | target_measurement_uuid=TARGET_MEASUREMENT_UUID, 70 | prefix_measurement_uuid=PREFIX_MEASUREMENT_UUID, 71 | ) 72 | save_measurement_config(measurement_config, CONFIG_PATH) 73 | 74 | if do_target_pings: 75 | vps.extend(targets) 76 | 77 | logger.info(f"Starting target pings :: {TARGET_MEASUREMENT_UUID}") 78 | logger.info(f"Nb targets :: {len(targets)}") 79 | logger.info(f"Nb vps :: {len(vps)}") 80 | 81 | # measurement configuration for retrieval 82 | ping_targets( 83 | measurement_uuid=TARGET_MEASUREMENT_UUID, 84 | measurement_config=measurement_config, 85 | targets=targets, 86 | vps=vps, 87 | use_cache=True, 88 | ) 89 | 90 | # update config 91 | save_measurement_config(measurement_config, CONFIG_PATH) 92 | 93 | if do_target_prefix_pings: 94 | logger.info(f"Starting prefix pings :: {PREFIX_MEASUREMENT_UUID}") 95 | logger.info(f"Nb targets :: {len(targets)}") 96 | logger.info(f"Nb prefixes :: {len(target_prefixes)}") 97 | logger.info(f"Nb vps :: {len(vps)}") 98 | 99 | ping_prefixes( 100 | vps=vps, 101 | target_prefixes=target_prefixes, 102 | targets_per_prefix=targets_per_prefix, 103 | measurement_uuid=PREFIX_MEASUREMENT_UUID, 104 | measurement_config=measurement_config, 105 | ) 106 | 107 | 108 | def main_retrieve_results() -> None: 109 | """retrieve all measurement results related to million scale""" 110 | retrieve_target_measurements = True 111 | retrieve_prefix_measurements = True 112 | 113 | measurement_config = load_json(CONFIG_PATH) 114 | logger.info(f"{measurement_config}") 115 | 116 | if retrieve_target_measurements: 117 | target_measurement_uuid = measurement_config["target_measurements"][ 118 | "measurement_uuid" 119 | ] 120 | 121 | logger.info( 122 | f"retrieving results for measurement ids: {target_measurement_uuid}" 123 | ) 124 | 125 | # sometimes, not all probes give output, reduce timeout if you do not want to wait for too long 126 | response = retrieve_results(target_measurement_uuid, TARGET_MEASUREMENT_RESULTS) 127 | 128 | # will output into user tables 129 | insert_target_results(response) 130 | 131 | if retrieve_prefix_measurements: 132 | prefix_measurement_uuid = measurement_config["prefix_measurements"][ 133 | "measurement_uuid" 134 | ] 135 | 136 | logger.info( 137 | f"retrieving results for measurement ids: {prefix_measurement_uuid}" 138 | ) 139 | 140 | # sometimes, not 
all probes give output, reduce timeout if you do not want to wait for too long 141 | response = retrieve_results(prefix_measurement_uuid, PREFIX_MEASUREMENT_RESULTS) 142 | 143 | # will output into user tables 144 | insert_prefix_results(response) 145 | 146 | 147 | if __name__ == "__main__": 148 | do_measurements = True 149 | do_retrieve_results = True 150 | 151 | if do_measurements: 152 | main_measurements() 153 | 154 | if do_retrieve_results: 155 | main_retrieve_results() 156 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "geoscale" 3 | version = "1.0" 4 | description = "Geolocation reproduction paper" 5 | authors = ["Danaelmilo "] 6 | readme = "README.md" 7 | packages = [ 8 | { include = "scripts" }, 9 | ] 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.9" 13 | numpy = "^1.21.3" 14 | matplotlib = "^3.4.3" 15 | requests = "^2.17.0" 16 | clickhouse-driver = "^0.2.6" 17 | overpy = "^0.6" 18 | dnspython = "^2.4.1" 19 | geopy = "^2.3.0" 20 | ujson = "^5.8.0" 21 | scipy = "^1.5.0" 22 | geopandas = "^0.13.2" 23 | rasterio = "^1.3.8" 24 | ipykernel = "^6.24.0" 25 | jupyter = "^1.0.0" 26 | py-radix = "^0.10.0" 27 | pyasn = "^1.6.2" 28 | clickhouse_driver = "^0.2.2" 29 | python-dotenv="^0.20.0" 30 | 31 | [tool.poetry.dev-dependencies] 32 | ipykernel = "^6.25.1" 33 | 34 | [build-system] 35 | requires = ["poetry-core"] 36 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /scripts/ripe_atlas/atlas_api.py: -------------------------------------------------------------------------------- 1 | # All functions to query RIPE Atlas API 2 | 3 | import json 4 | import time 5 | import requests 6 | import ipaddress 7 | 8 | from collections import defaultdict, OrderedDict 9 | from ipaddress import IPv4Network 10 | from random import randint 11 | 12 | from logger import logger 13 | 14 | 15 | class RIPEAtlas(object): 16 | def __init__( 17 | self, 18 | account: str, 19 | key: str, 20 | ) -> None: 21 | self.account = account 22 | self.key = key 23 | 24 | def ping( 25 | self, target, vps, tag: str, nb_packets: int = 3, max_retry: int = 60 26 | ) -> None: 27 | """start ping measurement towards target from vps, return Atlas measurement id""" 28 | 29 | for _ in range(max_retry): 30 | response = requests.post( 31 | f"https://atlas.ripe.net/api/v2/measurements/?key={self.key}", 32 | json={ 33 | "definitions": [ 34 | { 35 | "target": target, 36 | "af": 4, 37 | "packets": nb_packets, 38 | "size": 48, 39 | "tags": [tag], 40 | "description": f"Dioptra Geolocation of {target}", 41 | "resolve_on_probe": False, 42 | "skip_dns_check": True, 43 | "include_probe_id": False, 44 | "type": "ping", 45 | } 46 | ], 47 | "probes": [ 48 | {"value": vp, "type": "probes", "requested": 1} for vp in vps 49 | ], 50 | "is_oneoff": True, 51 | "bill_to": self.account, 52 | }, 53 | ).json() 54 | 55 | try: 56 | measurement_id = response["measurements"][0] 57 | break 58 | except KeyError: 59 | logger.info(response) 60 | logger.warning("Too many measurements. Waiting.") 61 | time.sleep(60) 62 | else: 63 | raise Exception("Too many measurements. 
Stopping.") 64 | 65 | if not response: 66 | return 67 | 68 | try: 69 | return measurement_id 70 | except (IndexError, KeyError): 71 | return 72 | 73 | def traceroute_measurement(self, target, probes_selector, options): 74 | ripe_key, description, tags, is_public, packets, protocol = options 75 | 76 | core_parameters = { 77 | "target": target, 78 | "af": 4, 79 | "description": description, 80 | "resolve_on_probe": False, 81 | "type": "traceroute", 82 | "tags": tags, 83 | "is_public": is_public, 84 | } 85 | 86 | traceroute_parameters = { 87 | "packets": packets, 88 | "protocol": protocol, 89 | } 90 | 91 | parameters = {} 92 | parameters.update(core_parameters) 93 | parameters.update(traceroute_parameters) 94 | 95 | definitions = [parameters] 96 | 97 | response = requests.post( 98 | f"https://atlas.ripe.net/api/v2/measurements/?key={ripe_key}", 99 | json={ 100 | "definitions": definitions, 101 | "probes": [probes_selector], 102 | "is_oneoff": True, 103 | "bill_to": self.account, 104 | }, 105 | ).json() 106 | return response 107 | 108 | def __str__(self): 109 | return "RIPE Atlas" 110 | 111 | 112 | def ripe_traceroute_to_csv(traceroute): 113 | protocols = {"ICMP": 1, "TCP": 6, "UDP": 17} 114 | rows = [] 115 | try: 116 | src_addr = traceroute["from"] 117 | dst_addr = traceroute["dst_addr"] 118 | af = traceroute["af"] 119 | if af == 4: 120 | dst_prefix = ".".join(dst_addr.split(".")[:3] + ["0"]) 121 | elif af == 6: 122 | dst_prefix = str( 123 | ipaddress.ip_network(dst_addr + "/48", strict=False).network_address 124 | ) 125 | except (KeyError, ValueError): 126 | return rows 127 | 128 | for hop in traceroute["result"]: 129 | for response in hop.get("result", []): 130 | if not response or response.get("error"): 131 | continue 132 | if response.get("x") == "*" or not response.get("rtt"): 133 | response["from"] = "*" 134 | response["rtt"] = 0 135 | response["ttl"] = 0 136 | proto = protocols[traceroute["proto"]] 137 | try: 138 | row = ( 139 | src_addr, 140 | dst_prefix, 141 | dst_addr, 142 | response["from"], 143 | proto, 144 | hop["hop"], 145 | response["rtt"], 146 | response["ttl"], 147 | traceroute["prb_id"], 148 | traceroute["msm_id"], 149 | traceroute["timestamp"], 150 | ) 151 | row_str = "".join(f",{x}" for x in row)[1:] 152 | rows.append(row_str) 153 | except Exception: 154 | print("ERROR", response) 155 | 156 | return rows 157 | 158 | 159 | def fetch_traceroutes_from_measurement_ids_no_csv( 160 | measurement_ids, start=None, stop=None 161 | ): 162 | res = [] 163 | for measurement_id in measurement_ids: 164 | result_url = ( 165 | f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/?" 
166 | ) 167 | if start: 168 | result_url += f"start={start}" 169 | if stop: 170 | result_url += f"&stop={stop}" 171 | traceroutes = requests.get(result_url).json() 172 | if "error" in traceroutes: 173 | print(traceroutes) 174 | continue 175 | for traceroute in traceroutes: 176 | rows = ripe_traceroute_to_csv(traceroute) 177 | for row in rows: 178 | res.append(row) 179 | return res 180 | 181 | 182 | def wait_for(measurement_id: str, max_retry: int = 30) -> None: 183 | for _ in range(max_retry): 184 | response = requests.get( 185 | f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/" 186 | ).json() 187 | 188 | # check if measurement is ongoing or not 189 | if response["status"]["name"] != "Ongoing": 190 | return response 191 | 192 | time.sleep(10) 193 | 194 | return None 195 | 196 | 197 | def get_prefix_from_ip(addr): 198 | """from an ip addr return /24 prefix""" 199 | prefix = addr.split(".")[:-1] 200 | prefix.append("0") 201 | prefix = ".".join(prefix) 202 | return prefix 203 | 204 | 205 | def get_target_hitlist(target_prefix, nb_targets, targets_per_prefix): 206 | """from ip, return a list of target ips""" 207 | target_addr_list = [] 208 | try: 209 | target_addr_list = targets_per_prefix[target_prefix] 210 | except KeyError: 211 | pass 212 | 213 | target_addr_list = list(set(target_addr_list)) 214 | 215 | if len(target_addr_list) < nb_targets: 216 | prefix = IPv4Network(target_prefix + "/24") 217 | target_addr_list.extend( 218 | [ 219 | str(prefix[randint(1, 254)]) 220 | for _ in range(0, nb_targets - len(target_addr_list)) 221 | ] 222 | ) 223 | 224 | if len(target_addr_list) > nb_targets: 225 | target_addr_list = target_addr_list[:nb_targets] 226 | 227 | return target_addr_list 228 | 229 | 230 | def is_geoloc_disputed(probe: dict) -> bool: 231 | """check if geoloc disputed flag is contained in probe metadata""" 232 | 233 | tags = probe["tags"] 234 | for tag in tags: 235 | if tag["slug"] == "system-geoloc-disputed": 236 | return True 237 | return False 238 | 239 | 240 | def get_measurement_url(measurement_id: int) -> str: 241 | """return Atlas API url for get measurement request""" 242 | 243 | return f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/" 244 | 245 | 246 | def get_response(url: str, max_retry: int = 60, wait_time: int = 2) -> list: 247 | """request to Atlas API""" 248 | 249 | for _ in range(max_retry): 250 | response = requests.get(url) 251 | 252 | # small parsing, as response might not be Json formatted 253 | try: 254 | response = json.loads(response.content) 255 | except json.JSONDecodeError: 256 | response = response.content.decode() 257 | response = response.replace("}{", "}, {") 258 | response = response.replace("} {", "}, {") 259 | response = json.loads(response) 260 | 261 | if response != []: 262 | break 263 | time.sleep(wait_time) 264 | 265 | return response 266 | 267 | 268 | def parse_measurements_results(response: list) -> dict: 269 | """from get Atlas measurement request return parsed results""" 270 | 271 | # parse response 272 | measurement_results = defaultdict(dict) 273 | for result in response: 274 | # parse results and calculate geoloc 275 | if result.get("result") is not None: 276 | dst_addr = result["dst_addr"] 277 | vp_addr = result["from"] 278 | 279 | if type(result["result"]) == list: 280 | rtt_list = [list(rtt.values())[0] for rtt in result["result"]] 281 | else: 282 | rtt_list = [result["result"]["rtt"]] 283 | 284 | # remove stars from results 285 | rtt_list = list(filter(lambda x: x != "*", rtt_list)) 286 | if not 
rtt_list: 287 | continue 288 | 289 | # sometimes connection error with vantage point cause result to be string message 290 | try: 291 | min_rtt = min(rtt_list) 292 | except TypeError: 293 | continue 294 | 295 | if isinstance(min_rtt, str): 296 | continue 297 | 298 | measurement_results[dst_addr][vp_addr] = { 299 | "node": vp_addr, 300 | "min_rtt": min_rtt, 301 | "rtt_list": rtt_list, 302 | } 303 | 304 | else: 305 | logger.warning(f"no results: {result}") 306 | 307 | # order vps per increasing rtt 308 | for dst_addr in measurement_results: 309 | measurement_results[dst_addr] = OrderedDict( 310 | { 311 | vp: results 312 | for vp, results in sorted( 313 | measurement_results[dst_addr].items(), 314 | key=lambda item: item[1]["min_rtt"], 315 | ) 316 | } 317 | ) 318 | 319 | return measurement_results 320 | 321 | 322 | def get_measurement_from_id( 323 | measurement_id: int, 324 | max_retry: int = 60, 325 | wait_time: int = 10, 326 | ) -> dict: 327 | """retrieve measurement results from RIPE Atlas with measurement id""" 328 | 329 | url = get_measurement_url(measurement_id) 330 | 331 | response = get_response(url, max_retry=max_retry, wait_time=wait_time) 332 | 333 | return response 334 | 335 | 336 | def get_measurements_from_tag(tag: str) -> dict: 337 | """retrieve all measurements that share the same tag and return parsed measurement results""" 338 | 339 | url = f"https://atlas.ripe.net/api/v2/measurements/tags/{tag}/results/" 340 | 341 | response = get_response(url, max_retry=1, wait_time=1) 342 | 343 | return response 344 | 345 | 346 | def get_from_atlas(url: str): 347 | """get request url atlas endpoint""" 348 | response = requests.get(url).json() 349 | while True: 350 | for anchor in response["results"]: 351 | yield anchor 352 | 353 | if response["next"]: 354 | response = requests.get(response["next"]).json() 355 | else: 356 | break 357 | 358 | 359 | def get_atlas_probes() -> list: 360 | """return all connected atlas probes""" 361 | probes = [] 362 | rejected = 0 363 | geoloc_disputed = 0 364 | for _, probe in enumerate(get_from_atlas("https://atlas.ripe.net/api/v2/probes/")): 365 | # filter probes based on generic criteria 366 | if not probe["is_anchor"]: 367 | if ( 368 | probe["status"]["name"] != "Connected" 369 | or probe.get("geometry") is None 370 | or probe.get("address_v4") is None 371 | or probe.get("country_code") is None 372 | ): 373 | rejected += 1 374 | continue 375 | 376 | if is_geoloc_disputed(probe): 377 | geoloc_disputed += 1 378 | continue 379 | 380 | reduced_probe = { 381 | "id": probe["id"], 382 | "address_v4": probe["address_v4"], 383 | "asn_v4": probe["asn_v4"], 384 | "country_code": probe["country_code"], 385 | "geometry": probe["geometry"], 386 | } 387 | probes.append(reduced_probe) 388 | 389 | return probes, rejected, geoloc_disputed 390 | 391 | 392 | def get_atlas_anchors() -> list: 393 | """return all atlas anchors""" 394 | anchors = [] 395 | rejected = 0 396 | geoloc_disputed = 0 397 | for _, anchor in enumerate(get_from_atlas("https://atlas.ripe.net/api/v2/probes/")): 398 | # filter anchors based on generic criteria 399 | if anchor["is_anchor"]: 400 | if ( 401 | anchor["status"]["name"] != "Connected" 402 | or anchor.get("geometry") is None 403 | or anchor.get("address_v4") is None 404 | or anchor.get("country_code") is None 405 | ): 406 | rejected += 1 407 | continue 408 | 409 | if is_geoloc_disputed(anchor): 410 | geoloc_disputed += 1 411 | continue 412 | 413 | reduced_anchor = { 414 | "id": anchor["id"], 415 | "address_v4": anchor["address_v4"], 416 | "asn_v4": 
anchor["asn_v4"], 417 | "country_code": anchor["country_code"], 418 | "geometry": anchor["geometry"], 419 | "id": anchor["id"], 420 | } 421 | anchors.append(reduced_anchor) 422 | 423 | return anchors, rejected, geoloc_disputed 424 | -------------------------------------------------------------------------------- /scripts/ripe_atlas/ping_and_traceroute_classes.py: -------------------------------------------------------------------------------- 1 | # Two classes to instantiate before calling RIPE Atlas API: one for ping measurements and one for traceroute measurements 2 | 3 | import time 4 | 5 | from pprint import pprint 6 | from copy import copy 7 | 8 | from logger import logger 9 | from scripts.ripe_atlas.atlas_api import RIPEAtlas, wait_for, get_target_hitlist 10 | from scripts.utils.credentials import get_ripe_atlas_credentials 11 | 12 | 13 | MAX_NUMBER_OF_VPS = 1_000 14 | NB_MAX_CONCURRENT_MEASUREMENTS = 90 15 | NB_PACKETS = 3 16 | NB_TARGETS_PER_PREFIX = 3 17 | 18 | 19 | class PING: 20 | def __init__( 21 | self, 22 | ) -> None: 23 | ripe_credentials = get_ripe_atlas_credentials() 24 | 25 | self.account = ripe_credentials["username"] 26 | self.key = ripe_credentials["secret_key"] 27 | 28 | self.driver = RIPEAtlas(self.account, self.key) 29 | 30 | def ping_by_prefix( 31 | self, 32 | target_prefixes: list, 33 | vps: dict, 34 | targets_per_prefix: dict, 35 | tag: str, 36 | nb_packets: int = NB_PACKETS, 37 | nb_targets: int = NB_TARGETS_PER_PREFIX, 38 | dry_run: bool = False, 39 | ): 40 | """from a list of prefixes, start measurements for n target addrs in prefix""" 41 | 42 | active_measurements = [] 43 | all_measurement_ids = [] 44 | start_time = time.time() 45 | for i, target_prefix in enumerate(target_prefixes): 46 | 47 | logger.info( 48 | f"Ping for target prefix:: {target_prefix}, {i+1}/{len(target_prefixes)}" 49 | ) 50 | 51 | # get target_addr_list 52 | target_addr_list = get_target_hitlist( 53 | target_prefix, nb_targets, targets_per_prefix 54 | ) 55 | 56 | # get vps id for measurement, remove target if in vps 57 | 58 | logger.debug( 59 | f"starting measurement for {target_prefix} with {[addr for addr in target_addr_list]}" 60 | ) 61 | 62 | for target_addr in target_addr_list: 63 | vp_ids = [vp["id"] for vp in vps if vp["address_v4"] != target_addr] 64 | for i in range(0, len(vp_ids), MAX_NUMBER_OF_VPS): 65 | subset_vp_ids = vp_ids[i : i + MAX_NUMBER_OF_VPS] 66 | 67 | logger.debug( 68 | f"starting measurement for {target_addr} with {len(subset_vp_ids)} vps" 69 | ) 70 | 71 | if not dry_run: 72 | measurement_id = self.driver.ping( 73 | str(target_addr), subset_vp_ids, str(tag), nb_packets 74 | ) 75 | 76 | logger.info( 77 | f"measurement tag: {tag} : started measurement id : {measurement_id}" 78 | ) 79 | else: 80 | measurement_id = 404 81 | 82 | active_measurements.append(measurement_id) 83 | all_measurement_ids.append(measurement_id) 84 | 85 | # check number of parallel measurements in not too high 86 | if len(active_measurements) >= NB_MAX_CONCURRENT_MEASUREMENTS: 87 | logger.info( 88 | f"Reached limit for number of concurrent measurements: {len(active_measurements)}" 89 | ) 90 | tmp_measurement_ids = copy(active_measurements) 91 | for id in tmp_measurement_ids: 92 | # wait for the last measurement of the batch to end before starting a new one 93 | if not dry_run: 94 | measurement_result = wait_for(id) 95 | if measurement_result: 96 | active_measurements.remove(id) 97 | else: 98 | active_measurements.remove(id) 99 | time.sleep(0.5) 100 | 101 | logger.info(f"measurement : {tag} done") 
102 | 103 | end_time = time.time() 104 | 105 | return all_measurement_ids, start_time, end_time 106 | 107 | def ping_by_target( 108 | self, 109 | targets: list[dict], 110 | vps: list[dict], 111 | tag: str, 112 | nb_packets: int = NB_PACKETS, 113 | dry_run: bool = False, 114 | ): 115 | """from a list of prefixes, start measurements for n target addrs in prefix""" 116 | 117 | active_measurements = [] 118 | all_measurement_ids = [] 119 | start_time = time.time() 120 | for i, target_addr in enumerate(targets): 121 | logger.info(f"Ping for target:: {target_addr}, {i+1}/{len(targets)}") 122 | 123 | # get vps id for measurement, remove target if in vps 124 | vp_ids = [vp["id"] for vp in vps if vp["address_v4"] != target_addr] 125 | 126 | for i in range(0, len(vp_ids), MAX_NUMBER_OF_VPS): 127 | subset_vp_ids = vp_ids[i : i + MAX_NUMBER_OF_VPS] 128 | 129 | logger.debug( 130 | f"starting measurement for {target_addr} with {len(subset_vp_ids)} vps" 131 | ) 132 | 133 | if not dry_run: 134 | measurement_id = self.driver.ping( 135 | str(target_addr), subset_vp_ids, str(tag), nb_packets 136 | ) 137 | else: 138 | measurement_id = 404 139 | 140 | active_measurements.append(measurement_id) 141 | all_measurement_ids.append(measurement_id) 142 | 143 | logger.info( 144 | f"measurement tag: {tag} : started measurement id : {measurement_id}" 145 | ) 146 | 147 | # check number of parallel measurements in not too high 148 | if len(active_measurements) >= NB_MAX_CONCURRENT_MEASUREMENTS: 149 | logger.info( 150 | f"Reached limit for number of concurrent measurements: {len(active_measurements)}" 151 | ) 152 | tmp_measurement_ids = copy(active_measurements) 153 | for id in tmp_measurement_ids: 154 | # wait for the last measurement of the batch to end before starting a new one 155 | if not dry_run: 156 | measurement_result = wait_for(id) 157 | if measurement_result: 158 | active_measurements.remove(id) 159 | else: 160 | active_measurements.remove(id) 161 | time.sleep(0.5) 162 | 163 | logger.info(f"measurement : {tag} done") 164 | 165 | end_time = time.time() 166 | 167 | return all_measurement_ids, start_time, end_time 168 | 169 | 170 | class TRACEROUTE: 171 | def __init__( 172 | self, 173 | ) -> None: 174 | ripe_credentials = get_ripe_atlas_credentials() 175 | 176 | self.account = ripe_credentials["username"] 177 | self.key = ripe_credentials["secret_key"] 178 | self.driver = RIPEAtlas(self.account, self.key) 179 | 180 | def traceroute(self, target, probe_id): 181 | description = "Geoloc project" 182 | tags = ["traceroute", "test", "geoloc"] 183 | is_public = True 184 | probes = {"value": str(probe_id), "type": "probes", "requested": 1} 185 | packets = 3 186 | protocol = "ICMP" 187 | options = (self.key, description, tags, is_public, packets, protocol) 188 | 189 | response = self.driver.traceroute_measurement(target, probes, options) 190 | 191 | if "measurements" in response and len(response["measurements"]) == 1: 192 | return response["measurements"][0] 193 | else: 194 | print(f"Failed to traceroute") 195 | pprint(response) 196 | return None 197 | -------------------------------------------------------------------------------- /scripts/street_level/landmark.py: -------------------------------------------------------------------------------- 1 | # Do the landmark selection step as explained in the street level paper 2 | 3 | import requests 4 | import overpy 5 | import dns 6 | import dns.resolver 7 | import urllib3 8 | import pyasn 9 | import warnings 10 | 11 | from multiprocessing import Pool 12 | from bs4 import 
BeautifulSoup, MarkupResemblesLocatorWarning 13 | from geopy import Point, distance 14 | 15 | from scripts.utils.file_utils import load_json, dump_json 16 | from default import CACHED_WEBSITES_FILE, IP_TO_ASN_FILE 17 | 18 | 19 | warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning) 20 | urllib3.disable_warnings() 21 | 22 | 23 | def get_bounding_box(lat, lon): 24 | p = Point(lat, lon) 25 | d = distance.distance(kilometers=2).meters 26 | top_right = distance.distance(meters=d).destination(p, 45) 27 | bottom_left = distance.distance(meters=d).destination(p, 225) 28 | return (bottom_left.latitude, bottom_left.longitude, top_right.latitude, top_right.longitude) 29 | 30 | 31 | def check_domain_name_ip(domain_name, ip_address, protocol): 32 | # print(f"Checking {domain_name}") 33 | ip_url = protocol + "://" + ip_address 34 | domain_url = protocol + "://" + domain_name 35 | try: 36 | ip_response = requests.get(ip_url, verify=False, timeout=1) 37 | if ip_response.status_code != 200: 38 | return False 39 | domain_response = requests.get(domain_url, timeout=1) 40 | if domain_response.status_code != 200: 41 | return False 42 | except Exception: 43 | # print(traceback.format_exc()) 44 | return False 45 | 46 | try: 47 | ip_soup = BeautifulSoup(ip_response.content, "html.parser") 48 | domain_soup = BeautifulSoup(domain_response.content, "html.parser") 49 | ip_title = ip_soup.head.title.text 50 | domain_title = domain_soup.head.title.text 51 | if ip_title == domain_title: 52 | return True 53 | else: 54 | return False 55 | except: 56 | return False 57 | 58 | 59 | def check_and_get_website_ip(website, protocol): 60 | asns = ['20940', '16625', '12222', '16625', '21342', '21399', '32787', '35994', '35993', '35995', '36408', '393234', '394689', 61 | '13335', '202018', '202109', '133293', '395747', 62 | '54113', '209242', 63 | '16509', '14618', '16509', '39111', '16509', 64 | '8075', '8075', '8075', '12076', '12222', 65 | '15169', '36351', '22577', '36040', '55023', 66 | '22822', 67 | '701', '22394, 11608, 11608', 68 | '3356', '133229, 133229, 395570', 69 | '60068', '136620', '395354', 70 | '32934'] 71 | res = {} 72 | asndb = pyasn.pyasn(str(IP_TO_ASN_FILE)) 73 | try: 74 | result = dns.resolver.resolve(website) 75 | except Exception: 76 | # print(traceback.format_exc()) 77 | return {'dns-failed': True} 78 | if len(result) == 0: 79 | return {'dns-failed': True} 80 | res = {'dns-failed': False} 81 | 82 | ip = result[0].to_text() 83 | res['ip'] = ip 84 | asn = asndb.lookup(ip)[0] 85 | if asn == None: 86 | res['asn-found'] = False 87 | return res 88 | else: 89 | res['asn-found'] = True 90 | if str(asn) in asns or 'google' in website or 'facebook' in website or 'amazon' in website or 'microsoft' in website or 'azure' in website or 'akamai' in website or 'cdn' in website: 91 | res['cdn'] = True 92 | return res 93 | else: 94 | res['cdn'] = False 95 | 96 | if check_domain_name_ip(website, ip, protocol): 97 | res['header-test'] = True 98 | return res 99 | else: 100 | res['header-test'] = False 101 | return res 102 | 103 | 104 | def get_one_website_ip(domain, protocol, lat, lon): 105 | ip_info = check_and_get_website_ip(domain, protocol) 106 | ip_info['domain'] = domain 107 | ip_info['protocol'] = protocol 108 | ip_info['lat'] = lat 109 | ip_info['lon'] = lon 110 | return ip_info 111 | 112 | 113 | def get_landmarks_with_website_from_lat_lon(lat_arg, lon_arg): 114 | # api = overpy.Overpass() 115 | # api = overpy.Overpass(url="https://overpass.kumi.systems/api/interpreter") 116 | api = overpy.Overpass( 
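get_bounding_box above returns a (south, west, north, east) tuple whose corners lie roughly 2 km from the query point along the 45° and 225° bearings; the Overpass query below then collects every node and way tagged with a `website` inside that box. A quick, hedged check of the helper (the coordinates are arbitrary and the repository root is assumed to be on PYTHONPATH with the project's dependencies installed):

```python
from scripts.street_level.landmark import get_bounding_box

# ~2 km bounding box around central Paris (illustrative coordinates)
south, west, north, east = get_bounding_box(48.8566, 2.3522)
print(f"bbox = ({south:.4f}, {west:.4f}, {north:.4f}, {east:.4f})")
```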
117 | url="https://maps.mail.ru/osm/tools/overpass/api/interpreter") 118 | bbox = get_bounding_box(lat_arg, lon_arg) 119 | query = f""" 120 | [out:json]; 121 | ( 122 | node ({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}) 123 | [website]; 124 | way ({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}) 125 | [website]; 126 | ); 127 | out; 128 | """ 129 | result = api.query(query) 130 | res = [] 131 | for node in result.nodes: 132 | lat = float(node.lat) 133 | lon = float(node.lon) 134 | tags = node.tags 135 | website = tags['website'] 136 | res.append((website, lat, lon)) 137 | for way in result.ways: 138 | try: 139 | tmp_lat = 0 140 | tmp_lon = 0 141 | nodes = way.get_nodes(resolve_missing=True) 142 | for node in nodes: 143 | tmp_lat += float(node.lat) 144 | tmp_lon += float(node.lon) 145 | lat = tmp_lat/len(nodes) 146 | lon = tmp_lon/len(nodes) 147 | tags = way.tags 148 | website = tags['website'] 149 | res.append((website, lat, lon)) 150 | except: 151 | continue 152 | return res 153 | 154 | 155 | def get_all_landmarks_and_stats_from_points(points): 156 | dict_website = {} 157 | with Pool(8) as pool: 158 | results = pool.starmap(get_landmarks_with_website_from_lat_lon, points) 159 | for result in results: 160 | if result != None and result != []: 161 | for elem in result: 162 | dict_website[elem[0]] = elem 163 | 164 | unique_website = {} 165 | for url in dict_website: 166 | if "://" in url: 167 | protocol = url.split("://")[0] 168 | domain_name = url.split("://")[1] 169 | else: 170 | protocol = "http" 171 | domain_name = url 172 | website = domain_name.split("/")[0] 173 | if (website, protocol) not in unique_website: 174 | unique_website[(website, protocol)] = dict_website[url] 175 | 176 | args = [] 177 | failed_dns_count = 0 178 | failed_asn_count = 0 179 | cdn_count = 0 180 | failed_header_test_count = 0 181 | landmarks = [] 182 | 183 | try: 184 | all_websites = load_json(CACHED_WEBSITES_FILE) 185 | except FileNotFoundError: 186 | all_websites = {} 187 | 188 | for k, v in unique_website.items(): 189 | # fix websites 190 | if 'google' in k or 'facebook' in k or 'amazon' in k or 'microsoft' in k or 'azure' in k or 'akamai' in k or 'cdn' in k: 191 | all_websites[k]['cdn'] = True 192 | 193 | if k[0] not in all_websites: 194 | args.append((k[0], k[1], v[1], v[2])) 195 | else: 196 | result = all_websites[k[0]] 197 | if 'dns-failed' not in result or result['dns-failed']: 198 | failed_dns_count += 1 199 | continue 200 | if 'asn-found' not in result or not result['asn-found']: 201 | failed_asn_count += 1 202 | continue 203 | if 'cdn' not in result or result['cdn']: 204 | cdn_count += 1 205 | continue 206 | if 'header-test' not in result or not result['header-test']: 207 | failed_header_test_count += 1 208 | continue 209 | landmarks.append( 210 | (result['ip'], result['domain'], result['lat'], result['lon'])) 211 | 212 | with Pool() as pool: 213 | results = pool.starmap(get_one_website_ip, args) 214 | for result in results: 215 | all_websites[result['domain']] = result 216 | if 'dns-failed' not in result or result['dns-failed']: 217 | failed_dns_count += 1 218 | continue 219 | if 'asn-found' not in result or not result['asn-found']: 220 | failed_asn_count += 1 221 | continue 222 | if 'cdn' not in result or result['cdn']: 223 | cdn_count += 1 224 | continue 225 | if 'header-test' not in result or not result['header-test']: 226 | failed_header_test_count += 1 227 | continue 228 | landmarks.append( 229 | (result['ip'], result['domain'], result['lat'], result['lon'])) 230 | 231 | dump_json(all_websites, 
CACHED_WEBSITES_FILE) 232 | 233 | return failed_dns_count, failed_asn_count, cdn_count, failed_header_test_count, landmarks 234 | -------------------------------------------------------------------------------- /scripts/street_level/three_tiers.py: -------------------------------------------------------------------------------- 1 | # One function per tier of the street level method. 2 | 3 | import time 4 | 5 | from scripts.analysis.analysis import local_circle_preprocessing 6 | from scripts.street_level.landmark import get_all_landmarks_and_stats_from_points 7 | from scripts.utils.helpers import get_center_of_poly, get_points_in_poly 8 | from scripts.street_level.traceroutes_results import ( 9 | get_circles_to_target, 10 | start_and_get_traceroutes, 11 | ) 12 | 13 | 14 | def tier_1(target_ip, res, vps=None): 15 | st = time.time() 16 | # Get all circles (from each VP to the target) 17 | all_circles = get_circles_to_target(target_ip, vps) 18 | 19 | # Try the recommended internet speed at first 20 | speed_threshold = 4 / 9 21 | imp_circles = local_circle_preprocessing( 22 | all_circles, speed_threshold=speed_threshold 23 | ) 24 | lat, lon = get_center_of_poly(imp_circles, speed_threshold) 25 | 26 | # If there is no intersection polygone try a slower interent speed 27 | if lat == None or lon == None: 28 | speed_threshold = 2 / 3 29 | imp_circles = local_circle_preprocessing( 30 | all_circles, speed_threshold=speed_threshold 31 | ) 32 | lat, lon = get_center_of_poly(imp_circles, speed_threshold) 33 | res["speed_threshold"] = speed_threshold 34 | res["tier1:lat"] = lat 35 | res["tier1:lon"] = lon 36 | res["vps"] = imp_circles 37 | et = time.time() 38 | # Saving the time needed to perform this step 39 | res["tier1:duration"] = et - st 40 | return res 41 | 42 | 43 | def tier_2(target_ip, res, vps=None): 44 | st = time.time() 45 | tier2_points = get_points_in_poly(res["vps"], 36, 5, res["speed_threshold"]) 46 | res["tier2:all_points_count"] = len(tier2_points) 47 | 48 | # We remove points further than 1000km from the estimated center of the polygone (in case the intersection area is too big) 49 | tier2_points = tier2_points[: 200 * 10 + 1] 50 | res["tier2:inspected_points_count"] = len(tier2_points) 51 | if len(tier2_points) == 0: 52 | res["tier2:lat"] = None 53 | res["tier2:lon"] = None 54 | et = time.time() 55 | res["tier2:duration"] = et - st 56 | return res 57 | 58 | ( 59 | failed_dns_count, 60 | failed_asn_count, 61 | cdn_count, 62 | failed_header_test_count, 63 | landmarks, 64 | ) = get_all_landmarks_and_stats_from_points(tier2_points) 65 | # We save stats for possiblity of a website to be used as a landmark 66 | res["tier2:failed_dns_count"] = failed_dns_count 67 | res["tier2:failed_asn_count"] = failed_asn_count 68 | res["tier2:cdn_count"] = cdn_count 69 | res["tier2:non_cdn_count"] = len(landmarks) + failed_header_test_count 70 | res["tier2:landmark_count"] = len(landmarks) 71 | res["tier2:failed_header_test_count"] = failed_header_test_count 72 | res["tier2:landmarks"] = landmarks 73 | 74 | if len(res["tier2:landmarks"]) == 0: 75 | res["tier2:lat"] = None 76 | res["tier2:lon"] = None 77 | et = time.time() 78 | res["tier2:duration"] = et - st 79 | return res 80 | 81 | res["tier2:traceroutes"] = start_and_get_traceroutes( 82 | target_ip, res["vps"], res["tier2:landmarks"], vps 83 | ) 84 | all_circles = [] 85 | best_rtt = 5000 86 | res_lat = None 87 | res_lon = None 88 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[ 89 | "tier2:traceroutes" 90 | ]: 91 | if 
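# rtt here is the d1 + d2 estimate returned by get_rtt_diff() (see
# scripts/street_level/traceroutes_results.py); lat/lon are the landmark's
# coordinates. A negative value (e.g. -1) means no usable common hop was
# found, so the landmark is skipped.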
rtt < 0: 92 | continue 93 | all_circles.append((lat, lon, rtt, None, None)) 94 | if rtt < best_rtt: 95 | best_rtt = rtt 96 | res_lat = lat 97 | res_lon = lon 98 | 99 | # If there is no valid RTT then tier 2 has failed and we can not go further 100 | if len(all_circles) == 0: 101 | res["tier2:lat"] = None 102 | res["tier2:lon"] = None 103 | et = time.time() 104 | res["tier2:duration"] = et - st 105 | return res 106 | 107 | # If not, we use the smallest rtt landmark as 108 | res["tier2:lat"] = res_lat 109 | res["tier2:lon"] = res_lon 110 | res["tier2:final_circles"] = all_circles 111 | et = time.time() 112 | res["tier2:duration"] = et - st 113 | return res 114 | 115 | 116 | def tier_3(target_ip, res, vps=None): 117 | st = time.time() 118 | if "tier2:final_circles" not in res: 119 | res["tier3:lat"] = None 120 | res["tier3:lon"] = None 121 | et = time.time() 122 | res["tier3:duration"] = et - st 123 | return res 124 | 125 | else: 126 | all_circles = res["tier2:final_circles"] 127 | 128 | imp_circles = local_circle_preprocessing( 129 | all_circles, speed_threshold=res["speed_threshold"] 130 | ) 131 | tier3_points = get_points_in_poly( 132 | imp_circles, 10, 1, res["speed_threshold"], res["vps"] 133 | ) 134 | res["tier3:all_points_count"] = len(tier3_points) 135 | 136 | # We remove points/zipcodes further then 40Km away from the center of the polygone 137 | tier3_points = tier3_points[: 40 * 36 + 1] 138 | res["tier3:inspected_points_count"] = len(tier3_points) 139 | if len(tier3_points) == 0: 140 | res["tier3:lat"] = None 141 | res["tier3:lon"] = None 142 | et = time.time() 143 | res["tier3:duration"] = et - st 144 | return res 145 | 146 | ( 147 | failed_dns_count, 148 | failed_asn_count, 149 | cdn_count, 150 | failed_header_test_count, 151 | tmp_landmarks, 152 | ) = get_all_landmarks_and_stats_from_points(tier3_points) 153 | landmarks = [] 154 | for landmark in tmp_landmarks: 155 | ip = landmark[0] 156 | found = False 157 | for t2_lm in res["tier2:landmarks"]: 158 | if t2_lm[0] == ip: 159 | found = True 160 | break 161 | if not found: 162 | landmarks.append(landmark) 163 | 164 | res["tier3:failed_dns_count"] = failed_dns_count 165 | res["tier3:failed_asn_count"] = failed_asn_count 166 | res["tier3:cdn_count"] = cdn_count 167 | res["tier3:non_cdn_count"] = len(landmarks) + failed_header_test_count 168 | res["tier3:landmark_count"] = len(landmarks) 169 | res["tier3:failed_header_test_count"] = failed_header_test_count 170 | res["tier3:landmarks"] = landmarks 171 | 172 | if len(res["tier3:landmarks"]) == 0: 173 | res["tier3:lat"] = None 174 | res["tier3:lon"] = None 175 | et = time.time() 176 | res["tier3:duration"] = et - st 177 | return res 178 | 179 | res["tier3:traceroutes"] = start_and_get_traceroutes( 180 | target_ip, res["vps"], res["tier3:landmarks"], vps 181 | ) 182 | 183 | best_lon = None 184 | best_lat = None 185 | best_rtt = 5000 186 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[ 187 | "tier2:traceroutes" 188 | ]: 189 | if rtt < 0: 190 | continue 191 | if rtt < best_rtt: 192 | best_rtt = rtt 193 | best_lon = lon 194 | best_lat = lat 195 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[ 196 | "tier3:traceroutes" 197 | ]: 198 | if rtt < 0: 199 | continue 200 | if rtt < best_rtt: 201 | best_rtt = rtt 202 | best_lon = lon 203 | best_lat = lat 204 | 205 | res["tier3:lat"] = best_lat 206 | res["tier3:lon"] = best_lon 207 | et = time.time() 208 | res["tier3:duration"] = et - st 209 | return res 210 | 211 | 212 | def 
get_all_info_geoloc(target_ip, vps=None): 213 | # Init results 214 | res = { 215 | "target_ip": target_ip, 216 | "tier1:done": False, 217 | "tier2:done": False, 218 | "tier3:done": False, 219 | "negative_rtt_included": True, 220 | } 221 | res = tier_1(target_ip, res, vps=vps) 222 | 223 | # Using tier 1(CBG) results as geolocation if the other steps fail 224 | res["lat"] = res["tier1:lat"] 225 | res["lon"] = res["tier1:lon"] 226 | if res["tier1:lat"] == None or res["tier1:lon"] == None: 227 | return res 228 | res["tier1:done"] = True 229 | 230 | res = tier_2(target_ip, res, vps=vps) 231 | 232 | # Using tier 2 resultsas geolocation if the last step fails 233 | if res["tier2:lat"] == None or res["tier2:lon"] == None: 234 | return res 235 | else: 236 | res["tier2:done"] = True 237 | res["lat"] = res["tier2:lat"] 238 | res["lon"] = res["tier2:lon"] 239 | 240 | res = tier_3(target_ip, res, vps=vps) 241 | 242 | if res["tier3:lat"] != None and res["tier3:lon"] != None: 243 | res["tier3:done"] = True 244 | res["lat"] = res["tier3:lat"] 245 | res["lon"] = res["tier3:lon"] 246 | 247 | return res 248 | 249 | 250 | def geoloc(target_ip): 251 | """ 252 | This function return a dict containint the lat, lon coordinates of the given target_ip. 253 | The target_ip should be traceroutable. 254 | The function gives a less informative gelocation result than get_all_info_geoloc 255 | """ 256 | all_info = get_all_info_geoloc(target_ip) 257 | return {"lat": all_info["lat"], "lon": all_info["lon"]} 258 | -------------------------------------------------------------------------------- /scripts/street_level/traceroutes_results.py: -------------------------------------------------------------------------------- 1 | """Intermediate functions during street level traceroutes process""" 2 | 3 | import time 4 | 5 | from scripts.utils.clickhouse import Clickhouse 6 | from scripts.utils.file_utils import load_json 7 | from scripts.ripe_atlas.ping_and_traceroute_classes import TRACEROUTE 8 | from scripts.ripe_atlas.atlas_api import fetch_traceroutes_from_measurement_ids_no_csv 9 | from default import USER_ANCHORS_FILE, STREET_LEVEL_TRACEROUTES_TABLE 10 | 11 | 12 | def start_traceroutes_to_targets(targets, probes): 13 | results_to_get = [] 14 | for target in targets: 15 | target_ip = target[0] 16 | for probe in probes: 17 | probe_ip = probe["address_v4"] 18 | probe_id = str(probe["id"]) 19 | trace = TRACEROUTE() 20 | res = trace.traceroute(target_ip, probe_id) 21 | if res != None: 22 | results_to_get.append((res, probe_ip, target_ip)) 23 | return results_to_get 24 | 25 | 26 | def get_traceroutes_results(traceroute_ids): 27 | next_to_do = [] 28 | for id in traceroute_ids: 29 | next_to_do.append(id) 30 | nb_tries = 20 31 | while nb_tries > 0 and len(next_to_do) > 0: 32 | nb_tries -= 1 33 | to_do = [] 34 | for id in next_to_do: 35 | to_do.append(id) 36 | 37 | next_to_do = [] 38 | 39 | for id in to_do: 40 | try: 41 | ids = [id] 42 | traceroute_data = fetch_traceroutes_from_measurement_ids_no_csv(ids) 43 | if len(traceroute_data) == 0: 44 | next_to_do.append(id) 45 | else: 46 | insert_lst = [] 47 | for t in traceroute_data: 48 | ts = t.split(",") 49 | insert_lst.append( 50 | ( 51 | ts[0], 52 | ts[1], 53 | ts[2], 54 | ts[3], 55 | int(ts[4]), 56 | int(ts[5]), 57 | float(ts[6]), 58 | int(ts[7]), 59 | int(ts[8]), 60 | int(ts[9]), 61 | int(ts[10]), 62 | ) 63 | ) 64 | # We insert traceroute data into the database to be used later 65 | clickhouse_driver = Clickhouse() 66 | query = 
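# The 11-tuples assembled above match the column order expected by the insert
# query built just below (insert_street_lvl_traceroutes_query) and by
# create_street_level_table() in scripts/utils/clickhouse.py:
#   src_addr, dst_prefix, dst_addr, resp_addr, proto, hop, rtt, ttl, prb_id, msm_id, tstamp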
clickhouse_driver.insert_street_lvl_traceroutes_query(STREET_LEVEL_TRACEROUTES_TABLE) 67 | clickhouse_driver.execute(query, insert_lst) 68 | except Exception: 69 | next_to_do.append(id) 70 | if len(next_to_do) > 0: 71 | # We wait to try again 72 | time.sleep(15) 73 | 74 | 75 | """ 76 | Function starts and fetches traceroute from all probes to all targets 77 | """ 78 | 79 | 80 | def multi_traceroutes(targets, probes): 81 | tmp_res_traceroutes = start_traceroutes_to_targets(targets, probes) 82 | traceroute_ids = [] 83 | for elem in tmp_res_traceroutes: 84 | traceroute_ids.append(elem[0]) 85 | 86 | get_traceroutes_results(traceroute_ids) 87 | return tmp_res_traceroutes 88 | 89 | 90 | def tier_1_performe_traceroutes(target_ip, vps=None): 91 | # Traceroute from every VP to the target 92 | if vps == None: 93 | probes = load_json(USER_ANCHORS_FILE) 94 | else: 95 | probes = vps 96 | multi_traceroutes([[target_ip]], probes) 97 | 98 | 99 | def get_circles_to_target(target_ip, vps=None): 100 | # Get Rtts from all VPs to the targets if traceroutes are already done 101 | clickhouse_driver = Clickhouse() 102 | query = clickhouse_driver.get_all_rtt_to_dst_address_query(STREET_LEVEL_TRACEROUTES_TABLE, target_ip) 103 | res = clickhouse_driver.execute(query) 104 | # If None we need to lunch traceroutes from every VP to the target 105 | if len(res) == 0: 106 | tier_1_performe_traceroutes(target_ip, vps) 107 | res = clickhouse_driver.execute(query) 108 | if len(res) == 0: 109 | return [] 110 | 111 | # Calculate per VP min RTT 112 | dict_rtt = {} 113 | for hop in res: 114 | if hop[0] not in dict_rtt: 115 | dict_rtt[hop[0]] = (hop[1], hop[2]) 116 | if hop[2] > dict_rtt[hop[0]][1]: 117 | dict_rtt[hop[0]] = (hop[1], hop[2]) 118 | if hop[2] == dict_rtt[hop[0]][1] and hop[1] < dict_rtt[hop[0]][0]: 119 | dict_rtt[hop[0]] = (hop[1], hop[2]) 120 | 121 | # From IPs get Geolocation given by RIPE Atlas 122 | if vps == None: 123 | probes_data = load_json(USER_ANCHORS_FILE) 124 | else: 125 | probes_data = vps 126 | dict_probe_info = {} 127 | for probe in probes_data: 128 | if probe["address_v4"] == target_ip: 129 | continue 130 | if "address_v4" not in probe or probe["address_v4"] not in dict_rtt: 131 | continue 132 | if ( 133 | "geometry" not in probe 134 | or "type" not in probe["geometry"] 135 | or probe["geometry"]["type"] != "Point" 136 | or "coordinates" not in probe["geometry"] 137 | ): 138 | continue 139 | lon, lat = probe["geometry"]["coordinates"] 140 | dict_probe_info[probe["address_v4"]] = ( 141 | lat, 142 | lon, 143 | dict_rtt[probe["address_v4"]][0], 144 | None, 145 | None, 146 | ) 147 | 148 | # Return a list of items 149 | # each Item is a VP (lat, lon, min_rtt, dist = None, dist_r = None) 150 | res = [] 151 | for k, v in dict_probe_info.items(): 152 | res.append(v) 153 | return res 154 | 155 | 156 | def get_rtt_diff(probe_ip, target_ip, landmark_ip): 157 | clickhouse_driver = Clickhouse() 158 | query = clickhouse_driver.get_all_rtt_from_probe_to_targets_query(STREET_LEVEL_TRACEROUTES_TABLE, probe_ip, target_ip, landmark_ip) 159 | res = clickhouse_driver.execute(query) 160 | rtt_dict_target = {} 161 | rtt_dict_landmark = {} 162 | 163 | for l in res: 164 | resp_ip = l[0] 165 | dst_ip = l[1] 166 | rtt = l[2] 167 | if dst_ip == target_ip: 168 | if resp_ip not in rtt_dict_target: 169 | rtt_dict_target[resp_ip] = rtt 170 | if rtt < rtt_dict_target[resp_ip]: 171 | rtt_dict_target[resp_ip] = rtt 172 | elif dst_ip == landmark_ip: 173 | if resp_ip not in rtt_dict_landmark: 174 | rtt_dict_landmark[resp_ip] = rtt 175 | 
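# This function returns the street-level d1 + d2 estimate: with R(x, y) the
# minimum observed RTT, it computes
#     R(vp, target) + R(vp, landmark) - 2 * R(vp, r1)
# where r1 is the last router shared by both paths (the common responding IP
# with the largest per-hop minimum RTT, taking the smaller of its two per-path
# minimums). Illustrative numbers: 30 ms + 28 ms - 2 * 24 ms = 10 ms.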
if rtt < rtt_dict_landmark[resp_ip]: 176 | rtt_dict_landmark[resp_ip] = rtt 177 | if target_ip not in rtt_dict_target or landmark_ip not in rtt_dict_landmark: 178 | return -1, None 179 | target_rtt = rtt_dict_target[target_ip] 180 | landmark_rtt = rtt_dict_landmark[landmark_ip] 181 | same_dict = {} 182 | for ip in rtt_dict_target: 183 | if ip in rtt_dict_landmark: 184 | same_dict[ip] = min(rtt_dict_landmark[ip], rtt_dict_target[ip]) 185 | best_rtt = 0 186 | best_ip = None 187 | for k, v in same_dict.items(): 188 | if v > best_rtt: 189 | best_rtt = v 190 | best_ip = k 191 | return target_rtt + landmark_rtt - best_rtt - best_rtt, best_ip 192 | 193 | 194 | def get_probes_to_use_for_circles(circles, vps=None): 195 | if vps == None: 196 | probes_data = load_json(USER_ANCHORS_FILE) 197 | else: 198 | probes_data = vps 199 | lats_lons = {} 200 | for circle in circles: 201 | lats_lons[(circle[0], circle[1])] = circle 202 | res = [] 203 | for probe in probes_data: 204 | if ( 205 | "geometry" not in probe 206 | or "type" not in probe["geometry"] 207 | or probe["geometry"]["type"] != "Point" 208 | or "coordinates" not in probe["geometry"] 209 | ): 210 | continue 211 | lon, lat = probe["geometry"]["coordinates"] 212 | if (lat, lon) in lats_lons: 213 | res.append(probe) 214 | return res 215 | 216 | 217 | def start_and_get_traceroutes(target_ip, used_vps, landmarks, all_vps): 218 | probes = get_probes_to_use_for_circles(used_vps, all_vps) 219 | tmp_res_traceroutes = multi_traceroutes(landmarks, probes) 220 | 221 | # For each traceroute to a landmark we try to get the last common router/IP (r1ip) and the distance d1 + d2 (rtt) 222 | res = [] 223 | for t in tmp_res_traceroutes: 224 | traceroute_id = t[0] 225 | probe_ip = t[1] 226 | landmark_ip = t[2] 227 | rtt, r1ip = get_rtt_diff(probe_ip, target_ip, landmark_ip) 228 | for landmark in landmarks: 229 | if landmark[0] == landmark_ip: 230 | res.append( 231 | ( 232 | probe_ip, 233 | target_ip, 234 | landmark_ip, 235 | r1ip, 236 | rtt, 237 | landmark[2], 238 | landmark[3], 239 | traceroute_id, 240 | ) 241 | ) 242 | break 243 | return res 244 | 245 | 246 | def serialize(res1): 247 | res = {} 248 | for k, v in res1.items(): 249 | res[k] = v 250 | if "vps" in res: 251 | tmp_lst = [] 252 | for x in res["vps"]: 253 | tmp_lst.append(list(x)) 254 | res["vps"] = tmp_lst 255 | if "tier2:landmarks" in res: 256 | tmp_lst = [] 257 | for x in res["tier2:landmarks"]: 258 | tmp_lst.append(list(x)) 259 | res["tier2:landmarks"] = tmp_lst 260 | if "tier2:traceroutes" in res: 261 | tmp_lst = [] 262 | for x in res["tier2:traceroutes"]: 263 | tmp_lst.append(list(x)) 264 | res["tier2:traceroutes"] = tmp_lst 265 | if "tier3:landmarks" in res: 266 | tmp_lst = [] 267 | for x in res["tier3:landmarks"]: 268 | tmp_lst.append(list(x)) 269 | res["tier3:landmarks"] = tmp_lst 270 | if "tier3:traceroutes" in res: 271 | tmp_lst = [] 272 | for x in res["tier3:traceroutes"]: 273 | tmp_lst.append(list(x)) 274 | res["tier3:traceroutes"] = tmp_lst 275 | return res 276 | -------------------------------------------------------------------------------- /scripts/utils/clickhouse.py: -------------------------------------------------------------------------------- 1 | """clickhouse client""" 2 | 3 | import subprocess 4 | 5 | from pathlib import Path 6 | from clickhouse_driver import Client 7 | 8 | from logger import logger 9 | from default import ( 10 | CLICKHOUSE_HOST, 11 | CLICKHOUSE_DB, 12 | CLICKHOUSE_USER, 13 | CLICKHOUSE_PASSWORD, 14 | CLICKHOUSE_CLIENT, 15 | ) 16 | 17 | 18 | class Clickhouse: 19 | 
def __init__( 20 | self, 21 | host: str = CLICKHOUSE_HOST, 22 | database: str = CLICKHOUSE_DB, 23 | user: str = CLICKHOUSE_USER, 24 | password: str = CLICKHOUSE_PASSWORD, 25 | client_path: Path = CLICKHOUSE_CLIENT, 26 | ) -> None: 27 | self.host = host 28 | self.database = database 29 | self.user = user 30 | self.password = password 31 | self.client_path = client_path 32 | 33 | self.client: Client = Client( 34 | host=self.host, user=self.user, password=self.password 35 | ) 36 | 37 | self.settings = {"max_block_size": 100000} 38 | 39 | def get_min_rtt_per_src_dst_query( 40 | self, table: str, filter: str, threshold=10000 41 | ) -> str: 42 | return f""" 43 | WITH arrayMin(groupArray(`min`)) as min_rtt 44 | SELECT IPv4NumToString(dst), IPv4NumToString(src), min_rtt 45 | FROM {self.database}.{table} 46 | WHERE `min` > -1 AND `min`< {threshold} AND dst != src {filter} 47 | GROUP BY (dst, src) 48 | """ 49 | 50 | def get_min_rtt_per_src_dst_prefix_query( 51 | self, table: str, filter: str, threshold=10000 52 | ) -> str: 53 | return f""" 54 | WITH arrayMin(groupArray(`min`)) as min_rtt 55 | SELECT IPv4NumToString(dst_prefix), IPv4NumToString(src), min_rtt 56 | FROM {self.database}.{table} 57 | WHERE `min` > -1 AND `min`< {threshold} 58 | AND dst_prefix != toIPv4(substring(cutIPv6(IPv4ToIPv6(src), 0, 1), 8)) 59 | {filter} 60 | GROUP BY dst_prefix, src 61 | """ 62 | 63 | def get_all_rtt_to_dst_address_query(self, table: str, target: str) -> str: 64 | return f""" 65 | SELECT src_addr, rtt, tstamp 66 | FROM {self.database}.{table} 67 | WHERE resp_addr = '{target}' AND dst_addr = '{target}' 68 | """ 69 | 70 | def get_all_rtt_from_probe_to_targets_query( 71 | self, table: str, src: str, target1: str, target2: str 72 | ) -> str: 73 | return f""" 74 | SELECT resp_addr, dst_addr, rtt 75 | FROM {self.database}.{table} 76 | WHERE src_addr = '{src}' and (dst_addr = '{target1}' or dst_addr = '{target2}') 77 | """ 78 | 79 | def insert_street_lvl_traceroutes_query(self, table: str) -> str: 80 | return f""" 81 | INSERT 82 | INTO {self.database}.{table} ( 83 | src_addr, dst_prefix, dst_addr, resp_addr, 84 | proto, hop, rtt, ttl, prb_id, msm_id, tstamp 85 | ) VALUES 86 | """ 87 | 88 | def insert_native_query(self, table: str, infile_path: Path) -> str: 89 | """insert data using local clickhouse file""" 90 | return f""" 91 | INSERT INTO {self.database}.{table} 92 | FROM INFILE '{str(infile_path)}' 93 | FORMAT Native""" 94 | 95 | def insert_csv_query(self, table: str, infile_path: Path) -> str: 96 | """insert data from csv file""" 97 | return f""" 98 | INSERT INTO {self.database}.{table} 99 | FROM INFILE '{str(infile_path)}' 100 | FORMAT CSV 101 | """ 102 | 103 | def insert_file(self, query: str) -> None: 104 | """execute clickhouse insert query as not supported by clickhouse-driver""" 105 | cmd = f"{str(self.client_path)} client" 106 | 107 | if self.password is not None and self.password != "": 108 | cmd += f"--password={self.password}" 109 | cmd += f' --query="{query}"' 110 | 111 | logger.info(f"executing query: {cmd}") 112 | 113 | ps = subprocess.run(cmd, shell=True, capture_output=True, text=True) 114 | 115 | if ps.stderr: 116 | raise RuntimeError( 117 | f"Could not insert data::{cmd}, failed with error: {ps.stderr}" 118 | ) 119 | else: 120 | logger.info(f"{cmd}::Successfully executed") 121 | 122 | def execute(self, query: str, arg_lst=[]) -> None: 123 | """execute query using clickhouse driver""" 124 | if arg_lst == []: 125 | return self.client.execute(query, settings=self.settings) 126 | else: 127 | return 
self.client.execute(query, arg_lst, settings=self.settings) 128 | 129 | def insert_from_values_query(self, table: str, values_description: str) -> str: 130 | """insert data from csv file""" 131 | return f""" 132 | INSERT INTO {self.database}.{table} 133 | ({values_description}) 134 | VALUES 135 | """ 136 | 137 | def insert_from_values(self, query: str, data: list) -> None: 138 | return self.client.execute(query, data, settings=self.settings) 139 | 140 | def execute_iter(self, query: str) -> None: 141 | """use clickhouse driver instead of subprocess""" 142 | return self.client.execute_iter(query, settings=self.settings) 143 | 144 | def create_prefixes_ping_tables(self, table_name: str) -> str: 145 | """create all ping tables""" 146 | return f""" 147 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 148 | ( 149 | `src` IPv4, 150 | `dst` IPv4, 151 | `dst_prefix` IPv4 MATERIALIZED toIPv4(substring(cutIPv6(IPv4ToIPv6(dst), 0, 1), 8)), 152 | `prb_id` UInt32, 153 | `date` DateTime, 154 | `sent` UInt32, 155 | `rcvd` UInt32, 156 | `rtts` Array(Float64), 157 | `min` Float64, 158 | `mean` Float64, 159 | `msm_id` UInt64, 160 | `proto` UInt8 161 | ) 162 | ENGINE=MergeTree() 163 | ORDER BY (dst_prefix, dst, src, msm_id, date) 164 | """ 165 | 166 | def create_target_ping_tables(self, table_name: str) -> str: 167 | """create table""" 168 | return f""" 169 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 170 | ( 171 | `src` IPv4, 172 | `dst` IPv4, 173 | `prb_id` UInt32, 174 | `date` DateTime, 175 | `sent` UInt32, 176 | `rcvd` UInt32, 177 | `rtts` Array(Float64), 178 | `min` Float64, 179 | `mean` Float64, 180 | `msm_id` UInt64, 181 | `proto` UInt8 182 | ) 183 | ENGINE=MergeTree() 184 | ORDER BY (dst, src, msm_id, date) 185 | """ 186 | 187 | def create_traceroutes_table(self, table_name: str) -> str: 188 | return f""" 189 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 190 | ( 191 | `src_ip` String, 192 | `dst_prefix` String, 193 | `dst_ip` String, 194 | `reply_ip` String, 195 | `proto` Int16, 196 | `hop` Int16, 197 | `rtt` Float64, 198 | `ttl` Int16, 199 | `prb_id` Int64, 200 | `msm_id` Int64, 201 | `timestamp` DateTime('UTC') 202 | ) 203 | ENGINE=MergeTree() 204 | ORDER BY (dst_prefix, dst_ip, src_ip, reply_ip) 205 | """ 206 | 207 | def create_street_level_table(self, table_name: str) -> str: 208 | """create the street level traceroute table""" 209 | 210 | return f""" 211 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 212 | ( 213 | `src_addr` String, 214 | `dst_prefix` String, 215 | `dst_addr` String, 216 | `resp_addr` String, 217 | `proto` Int16, 218 | `hop` Int16, 219 | `rtt` Float64, 220 | `ttl` Int16, 221 | `prb_id` Int64, 222 | `msm_id` Int64, 223 | `tstamp` Datetime('UTC') 224 | ) 225 | ENGINE = MergeTree() 226 | ORDER BY (dst_addr, src_addr, tstamp) 227 | """ 228 | -------------------------------------------------------------------------------- /scripts/utils/clickhouse_installer.py: -------------------------------------------------------------------------------- 1 | """clickhouse client""" 2 | 3 | from scripts.utils.clickhouse import Clickhouse 4 | from logger import logger 5 | 6 | from default import * 7 | 8 | 9 | if __name__ == "__main__": 10 | clickhouse_driver = Clickhouse() 11 | 12 | ################################################################################################## 13 | # CREATE REPRO TABLES # 14 | ################################################################################################## 15 | 16 | # create anchors_meshed_table 17 | 
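Besides the table creation and bulk inserts driven by this installer, the query builders in the Clickhouse wrapper above can be used directly, for example to pull per-(src, dst) minimum RTTs. A hedged sketch (ANCHORS_MESHED_PING_TABLE comes from default.py; the local ClickHouse server set up by install.sh must be running):

```python
from default import ANCHORS_MESHED_PING_TABLE
from scripts.utils.clickhouse import Clickhouse

clickhouse = Clickhouse()
query = clickhouse.get_min_rtt_per_src_dst_query(ANCHORS_MESHED_PING_TABLE, filter="")
for dst, src, min_rtt in clickhouse.execute(query):
    print(dst, src, min_rtt)  # IPv4 strings and the minimum RTT in ms
```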
query = clickhouse_driver.create_target_ping_tables(ANCHORS_MESHED_PING_TABLE) 18 | clickhouse_driver.execute(query) 19 | logger.info(f"table {ANCHORS_MESHED_PING_TABLE} created") 20 | 21 | query = clickhouse_driver.create_target_ping_tables(PROBES_TO_ANCHORS_PING_TABLE) 22 | clickhouse_driver.execute(query) 23 | logger.info(f"table {PROBES_TO_ANCHORS_PING_TABLE} created") 24 | 25 | # create prefixes ping table 26 | query = clickhouse_driver.create_prefixes_ping_tables(ANCHORS_TO_PREFIX_TABLE) 27 | clickhouse_driver.execute(query) 28 | logger.info(f"table {ANCHORS_TO_PREFIX_TABLE} created") 29 | 30 | query = clickhouse_driver.create_prefixes_ping_tables(PROBES_TO_PREFIX_TABLE) 31 | clickhouse_driver.execute(query) 32 | logger.info(f"table {PROBES_TO_PREFIX_TABLE} created") 33 | 34 | query = clickhouse_driver.create_prefixes_ping_tables( 35 | TARGET_TO_LANDMARKS_PING_TABLE 36 | ) 37 | clickhouse_driver.execute(query) 38 | logger.info(f"table {TARGET_TO_LANDMARKS_PING_TABLE} created") 39 | 40 | # create traceroute table 41 | query = clickhouse_driver.create_traceroutes_table(ANCHORS_MESHED_TRACEROUTE_TABLE) 42 | clickhouse_driver.execute(query) 43 | logger.info(f"table {ANCHORS_MESHED_TRACEROUTE_TABLE} created") 44 | 45 | # Create street level db 46 | query = clickhouse_driver.create_street_level_table(STREET_LEVEL_TRACEROUTES_TABLE) 47 | clickhouse_driver.execute(query) 48 | logger.info(f"table {STREET_LEVEL_TRACEROUTES_TABLE} created") 49 | 50 | ################################################################################################## 51 | # INSERT REPRO DATA # 52 | ################################################################################################## 53 | 54 | # table names 55 | tables = [ 56 | ANCHORS_MESHED_TRACEROUTE_TABLE, 57 | PROBES_TO_ANCHORS_PING_TABLE, 58 | ANCHORS_TO_PREFIX_TABLE, 59 | PROBES_TO_PREFIX_TABLE, 60 | ANCHORS_MESHED_PING_TABLE, 61 | TARGET_TO_LANDMARKS_PING_TABLE, 62 | STREET_LEVEL_TRACEROUTES_TABLE, 63 | ] 64 | 65 | # measurements files_path 66 | file_paths = [ 67 | ANCHORS_MESHED_TRACEROUTE_FILE, 68 | PROBES_TO_ANCHORS_PING_FILE, 69 | ANCHORS_TO_PREFIX_FILE, 70 | PROBES_TO_PREFIX_FILE, 71 | ANCHORS_MESHED_PING_FILE, 72 | TARGET_TO_LANDMARKS_PING_FILE, 73 | STREET_LEVEL_TRACEROUTES_FILE, 74 | ] 75 | 76 | for table_name, file_path in zip(tables, file_paths): 77 | logger.info(f"inserting data into {table_name} from {file_path}") 78 | insert_query = clickhouse_driver.insert_native_query(table_name, file_path) 79 | 80 | clickhouse_driver.insert_file(insert_query) 81 | 82 | ################################################################################################## 83 | # CREATE USER MEASUREMENT TABLES # 84 | ################################################################################################## 85 | 86 | query = clickhouse_driver.create_target_ping_tables(USER_VPS_TO_TARGET_TABLE) 87 | clickhouse_driver.execute(query) 88 | logger.info(f"table {USER_VPS_TO_TARGET_TABLE} created") 89 | 90 | query = clickhouse_driver.create_target_ping_tables(USER_MESHED_TABLE) 91 | clickhouse_driver.execute(query) 92 | logger.info(f"table {USER_MESHED_TABLE} created") 93 | 94 | # create prefixes ping table 95 | query = clickhouse_driver.create_prefixes_ping_tables(USER_VPS_TO_PREFIX_TABLE) 96 | clickhouse_driver.execute(query) 97 | logger.info(f"table {USER_VPS_TO_PREFIX_TABLE} created") 98 | 99 | query = clickhouse_driver.create_prefixes_ping_tables( 100 | USER_TARGET_TO_LANDMARKS_PING_TABLE 101 | ) 102 | clickhouse_driver.execute(query) 103 | 
logger.info(f"table {USER_TARGET_TO_LANDMARKS_PING_TABLE} created") 104 | 105 | # create traceroute table 106 | query = clickhouse_driver.create_traceroutes_table( 107 | USER_ANCHORS_MESHED_TRACEROUTE_TABLE 108 | ) 109 | clickhouse_driver.execute(query) 110 | logger.info(f"table {USER_ANCHORS_MESHED_TRACEROUTE_TABLE} created") 111 | 112 | # Create street level db 113 | query = clickhouse_driver.create_street_level_table( 114 | USER_STREET_LEVEL_TRACEROUTES_TABLE 115 | ) 116 | clickhouse_driver.execute(query) 117 | logger.info(f"table {USER_STREET_LEVEL_TRACEROUTES_TABLE} created") 118 | -------------------------------------------------------------------------------- /scripts/utils/credentials.py: -------------------------------------------------------------------------------- 1 | """get all credentials (Clickhouse and RIPE)""" 2 | 3 | import json 4 | import os 5 | 6 | from logger import logger 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | def get_clickhouse_credentials() -> dict: 13 | """return clickhouse credentials""" 14 | 15 | # try to get credentials with env var directly 16 | try: 17 | return { 18 | "base_url": os.environ["CLICKHOUSE_BASE_URL"], 19 | "user": os.environ["CLICKHOUSE_USER"], 20 | "password": os.environ["CLICKHOUSE_PASSWORD"], 21 | } 22 | 23 | except KeyError as e: 24 | logger.error( 25 | f"Missing credentials for interacting with IRIS API (set: CLICKHOUSE_BASE_URL | CLICKHOUSE_USERNAME | CLICKHOUSE_PASSWORD): {e}" 26 | ) 27 | 28 | 29 | def get_ripe_atlas_credentials() -> dict: 30 | """return ripe credentials""" 31 | try: 32 | return { 33 | "username": os.environ["RIPE_USERNAME"], 34 | "secret_key": os.environ["RIPE_SECRET_KEY"], 35 | } 36 | 37 | except KeyError as e: 38 | logger.error( 39 | f"Missing credentials for interacting with IRIS API (set: CLICKHOUSE_BASE_URL | CLICKHOUSE_USERNAME | CLICKHOUSE_PASSWORD): {e}" 40 | ) 41 | -------------------------------------------------------------------------------- /scripts/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | """Functions to load and save data into a json format. 2 | All the paths are given in default.py file. 
3 | """ 4 | import ujson as json 5 | 6 | from pathlib import Path 7 | 8 | 9 | def load_json(file_path: Path): 10 | # check that dirs exits 11 | if not file_path.parent.exists(): 12 | file_path.parent.mkdir(parents=True, exist_ok=True) 13 | 14 | with open(file_path) as f: 15 | return json.load(f) 16 | 17 | 18 | def dump_json(data, file_path: Path): 19 | """dump data to output file""" 20 | # check that dirs exits 21 | if not file_path.parent.exists(): 22 | file_path.parent.mkdir(parents=True, exist_ok=True) 23 | 24 | with open(file_path, "w") as f: 25 | json.dump(data, f, indent=4) 26 | 27 | 28 | # def append_results(data, file_path: Paths) 29 | -------------------------------------------------------------------------------- /scripts/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # Mathematical functions helpful for geolocation problems 2 | 3 | import itertools 4 | import numpy as np 5 | 6 | from math import asin, cos, log, radians, sin, sqrt, pi 7 | 8 | 9 | def internet_speed(rtt, speed_threshold): 10 | if speed_threshold is not None: 11 | return speed_threshold 12 | 13 | if rtt >= 80: 14 | speed_threshold = 4 / 9 15 | if rtt >= 5 and rtt < 80: 16 | speed_threshold = 3 / 9 17 | if rtt >= 0 and rtt < 5: 18 | speed_threshold = 1 / 6 19 | 20 | return speed_threshold 21 | 22 | 23 | def rtt_to_km(rtt, speed_threshold=None, c=300): 24 | return internet_speed(rtt, speed_threshold) * rtt * c / 2 25 | 26 | 27 | def is_within_cirle(vp_geo, rtt, candidate_geo, speed_threshold=None): 28 | d = rtt_to_km(rtt, speed_threshold) 29 | d_vp_candidate = haversine(vp_geo, candidate_geo) 30 | if d < d_vp_candidate: 31 | return False 32 | else: 33 | return True 34 | 35 | 36 | def geo_to_cartesian(lat, lon): 37 | lat *= np.pi / 180 38 | lon *= np.pi / 180 39 | 40 | x = np.cos(lon) * np.cos(lat) 41 | y = np.sin(lon) * np.cos(lat) 42 | z = np.sin(lat) 43 | 44 | return x, y, z 45 | 46 | 47 | def check_circle_inclusion(c_1, c_2): 48 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1 49 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2 50 | d = haversine((lat_1, lon_1), (lat_2, lon_2)) 51 | if d_1 > (d + d_2): 52 | return c_1, c_2 53 | elif d_2 > (d + d_1): 54 | return c_2, c_1 55 | return None, None 56 | 57 | 58 | def circle_preprocessing(circles, speed_threshold=None): 59 | circles_to_ignore = set() 60 | 61 | circles_with_r_info = [] 62 | for c in circles: 63 | lat, lon, rtt, d, r = c 64 | if d is None: 65 | d = rtt_to_km(rtt, speed_threshold) 66 | if r is None: 67 | r = d / 6371 68 | circles_with_r_info.append((lat, lon, rtt, d, r)) 69 | 70 | for i in range(len(circles_with_r_info)): 71 | c_1 = circles_with_r_info[i] 72 | if c_1 in circles_to_ignore: 73 | continue 74 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1 75 | for j in range(i + 1, len(circles_with_r_info)): 76 | c_2 = circles_with_r_info[j] 77 | if c_2 in circles_to_ignore: 78 | continue 79 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2 80 | remove, keep = check_circle_inclusion( 81 | (lat_1, lon_1, rtt_1, d_1, r_1), (lat_2, lon_2, rtt_2, d_2, r_2) 82 | ) 83 | if remove: 84 | circles_to_ignore.add(remove) 85 | 86 | circles_to_keep = set(circles_with_r_info) - circles_to_ignore 87 | 88 | return circles_to_keep 89 | 90 | 91 | def get_points_on_circle(lat_c, lon_c, r_c, nb_points: int = 4): 92 | """from a circle, return a set of points""" 93 | circle_points = [] 94 | for k in range(nb_points): 95 | # compute 96 | angle = pi * 2 * k / nb_points 97 | dx = r_c * 1000 * cos(angle) 98 | dy = r_c * 1000 * sin(angle) 99 | lat = lat_c + (180 / pi) * (dy / 
6378137) 100 | lon = lon_c + (180 / pi) * (dx / 6378137) / cos(lat_c * pi / 180) 101 | 102 | circle_points.append((lat, lon)) 103 | 104 | return circle_points 105 | 106 | 107 | def circle_intersections(circles, speed_threshold=None): 108 | """ 109 | Check out this link for more details about the maths: 110 | https://gis.stackexchange.com/questions/48937/calculating-intersection-of-two-circles 111 | """ 112 | intersect_points = [] 113 | 114 | circles = circle_preprocessing(circles, speed_threshold=speed_threshold) 115 | 116 | if len(circles) == 1: 117 | single_circle = list(circles)[0] 118 | lat, lon, rtt, d, r = single_circle 119 | filtered_points = get_points_on_circle(lat, lon, d) 120 | return filtered_points, circles 121 | 122 | for c_1, c_2 in itertools.combinations(circles, 2): 123 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1 124 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2 125 | 126 | x1 = np.array(list(geo_to_cartesian(lat_1, lon_1))) 127 | x2 = np.array(list(geo_to_cartesian(lat_2, lon_2))) 128 | 129 | q = np.dot(x1, x2) 130 | 131 | a = (np.cos(r_1) - np.cos(r_2) * q) / (1 - (q**2)) 132 | b = (np.cos(r_2) - np.cos(r_1) * q) / (1 - (q**2)) 133 | 134 | x0 = a * x1 + b * x2 135 | 136 | n = np.cross(x1, x2) 137 | if (1 - np.dot(x0, x0)) / np.dot(n, n) <= 0: 138 | # print("ANYCAST???", (lat_1, lon_1, rtt_1, d_1), (lat_2, lon_2, rtt_2, d_2)) 139 | continue 140 | 141 | t = np.sqrt((1 - np.dot(x0, x0)) / np.dot(n, n)) 142 | 143 | i1 = x0 + t * n 144 | i2 = x0 - t * n 145 | 146 | i_lon_1 = np.arctan2(i1[1], i1[0]) * (180 / np.pi) 147 | i_lat_1 = np.arctan(i1[2] / np.sqrt((i1[0] ** 2) + (i1[1] ** 2))) / ( 148 | np.pi / 180 149 | ) 150 | intersect_points.append((i_lat_1, i_lon_1)) 151 | 152 | i_lon_2 = np.arctan2(i2[1], i2[0]) * (180 / np.pi) 153 | i_lat_2 = np.arctan(i2[2] / np.sqrt((i2[0] ** 2) + (i2[1] ** 2))) / ( 154 | np.pi / 180 155 | ) 156 | intersect_points.append((i_lat_2, i_lon_2)) 157 | 158 | filtred_points = [] 159 | for point_geo in intersect_points: 160 | for lat_c, long_c, rtt_c, d_c, r_c in circles: 161 | if not is_within_cirle((lat_c, long_c), rtt_c, point_geo, speed_threshold): 162 | break 163 | else: 164 | filtred_points.append(point_geo) 165 | 166 | return filtred_points, circles 167 | 168 | 169 | def polygon_centroid(points): 170 | """ 171 | Compute polygon centroid using Finit Set of point method. 
172 | (see https://en.wikipedia.org/wiki/Centroid#Of_a_finite_set_of_points) 173 | """ 174 | x = 0 175 | y = 0 176 | for point in points: 177 | x += point[0] 178 | y += point[1] 179 | return x / len(points), y / len(points) 180 | 181 | 182 | def haversine(input_location, block_location): 183 | """Distance between two locations in earth.""" 184 | in_lat, in_lon, block_lat, block_lon = map( 185 | np.radians, [*input_location, *block_location] 186 | ) 187 | 188 | dlat = block_lat - in_lat 189 | dlon = block_lon - in_lon 190 | 191 | distances = ( 192 | np.sin(dlat / 2.0) ** 2 193 | + np.cos(in_lat) * np.cos(block_lat) * np.sin(dlon / 2.0) ** 2 194 | ) 195 | 196 | return 6367 * 2 * np.arcsin(np.sqrt(distances)) 197 | 198 | 199 | def distance(lat1, lat2, lon1, lon2): 200 | lon1 = radians(lon1) 201 | lon2 = radians(lon2) 202 | lat1 = radians(lat1) 203 | lat2 = radians(lat2) 204 | 205 | # Haversine formula 206 | dlon = lon2 - lon1 207 | dlat = lat2 - lat1 208 | a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2 209 | 210 | c = 2 * asin(sqrt(a)) 211 | 212 | r = 6371 213 | 214 | return c * r 215 | 216 | 217 | def get_middle_intersection(intersections): 218 | """in case of only two intersection points, return the middle segment""" 219 | (lat1, lon1) = intersections[0] 220 | (lat2, lon2) = intersections[1] 221 | 222 | # convert to radians 223 | lon1 = radians(lon1) 224 | lon2 = radians(lon2) 225 | lat1 = radians(lat1) 226 | lat2 = radians(lat2) 227 | 228 | # calculate the middle of two points 229 | Bx = np.cos(lat2) * np.cos(lon2 - lon1) 230 | By = np.cos(lat2) * np.sin(lon2 - lon1) 231 | latMid = np.arctan2( 232 | np.sin(lat1) + np.sin(lat2), 233 | np.sqrt((np.cos(lat1) + Bx) * (np.cos(lat1) + Bx) + By * By), 234 | ) 235 | lonMid = lon1 + np.arctan2(By, np.cos(lat1) + Bx) 236 | 237 | # convert back to degrees 238 | latMid = latMid * (180 / pi) 239 | lonMid = lonMid * (180 / pi) 240 | 241 | return latMid, lonMid 242 | 243 | 244 | def select_best_guess_centroid(target_ip, vp_coordinates_per_ip, rtt_per_vp_to_target): 245 | """ 246 | Find the best guess 247 | that is the location of the vantage point closest to the centroid. 
248 | """ 249 | probe_circles = {} 250 | closest_vp = None 251 | min_rtt_per_vp_ip = {} 252 | for vp_ip, rtts in rtt_per_vp_to_target.items(): 253 | if target_ip == vp_ip: 254 | continue 255 | if vp_ip not in vp_coordinates_per_ip: 256 | continue 257 | lat, lon = vp_coordinates_per_ip[vp_ip] 258 | min_rtt = min(rtts) 259 | if min_rtt > 100: 260 | continue 261 | min_rtt_per_vp_ip[vp_ip] = min_rtt 262 | # too inflated RTT means that measurement will not provide useful info 263 | 264 | if isinstance(min_rtt, float): 265 | probe_circles[vp_ip] = ( 266 | lat, 267 | lon, 268 | min_rtt, 269 | None, 270 | None, 271 | ) 272 | # print(f"vp_anchor = {vp_ip} with results: {min_rtt}") 273 | # print() 274 | 275 | # draw circles 276 | if not probe_circles: 277 | return None 278 | circles = list(probe_circles.values()) 279 | intersections, circles = circle_intersections(circles, speed_threshold=2/3) 280 | if len(intersections) > 2: 281 | centroid = polygon_centroid(intersections) 282 | elif len(intersections) == 2: 283 | # only two circles intersection, centroid is middle of the segment 284 | centroid = get_middle_intersection(intersections) 285 | else: 286 | # only one circle so take the closest vp as the centroid 287 | closest_vp, _ = min(min_rtt_per_vp_ip.items(), key=lambda x: x[1]) 288 | centroid = vp_coordinates_per_ip[closest_vp] 289 | 290 | return centroid, circles 291 | 292 | 293 | def get_center_of_poly(circles, speed): 294 | points, circles = circle_intersections(circles, speed) 295 | if len(points) == 0: 296 | return None, None 297 | return polygon_centroid(points) 298 | 299 | 300 | def get_points_in_poly(circles, rot, rad, speed, old_circles=[]): 301 | circles = circle_preprocessing(circles, speed_threshold=speed) 302 | points, circles = circle_intersections(circles, speed) 303 | if len(points) == 0: 304 | return [] 305 | else: 306 | center = polygon_centroid(points) 307 | res = [center] 308 | iter_rad = 0 309 | points_added = True 310 | while points_added: 311 | iter_rad += rad 312 | points_added = False 313 | to_add_points = get_points_on_circle( 314 | center[0], center[1], iter_rad, int(360/rot)) 315 | for point in to_add_points: 316 | all_in = True 317 | for vp in circles: 318 | if not is_within_cirle((vp[0], vp[1]), vp[2], point, speed): 319 | all_in = False 320 | break 321 | if all_in: 322 | for vp in old_circles: 323 | if not is_within_cirle((vp[0], vp[1]), vp[2], point, speed): 324 | all_in = False 325 | break 326 | if all_in: 327 | points_added = True 328 | res.append(point) 329 | return res 330 | 331 | def greedy_selection_probes_impl(probe, distance_per_probe, selected_probes): 332 | 333 | distances_log = [log(distance_per_probe[p]) for p in selected_probes 334 | if p in distance_per_probe and distance_per_probe[p] > 0] 335 | total_distance = sum(distances_log) 336 | return probe, total_distance 337 | 338 | -------------------------------------------------------------------------------- /scripts/utils/measurement_utils.py: -------------------------------------------------------------------------------- 1 | """functions for running measurements""" 2 | 3 | import random 4 | import time 5 | 6 | from datetime import datetime 7 | from uuid import UUID 8 | from pathlib import Path 9 | from dateutil import parser 10 | 11 | from logger import logger 12 | from scripts.utils.file_utils import load_json, dump_json 13 | from scripts.ripe_atlas.atlas_api import get_prefix_from_ip, get_measurements_from_tag 14 | from scripts.ripe_atlas.ping_and_traceroute_classes import PING 15 | from 
scripts.utils.clickhouse import Clickhouse 16 | 17 | from default import ( 18 | PREFIX_MEASUREMENT_RESULTS, 19 | TARGET_MEASUREMENT_RESULTS, 20 | USER_VPS_TO_PREFIX_TABLE, 21 | USER_VPS_TO_TARGET_TABLE, 22 | ) 23 | 24 | 25 | def load_targets(target_file_path: Path, nb_target: int = -1) -> list: 26 | """get a file as entry, return a list of ip target""" 27 | targets = load_json(target_file_path) 28 | 29 | if nb_target > len(targets) or nb_target < 0: 30 | nb_target = len(targets) 31 | 32 | subset_targets = random.sample(targets, k=nb_target) 33 | 34 | return subset_targets 35 | 36 | 37 | def load_vps(vps_file_path: Path, nb_vps: int = -1) -> list: 38 | """load vps from file, return list of vps""" 39 | vps = load_json(vps_file_path) 40 | 41 | if nb_vps > len(vps) or nb_vps < 0: 42 | nb_vps = len(vps) 43 | 44 | subset_vps = random.sample(vps, k=nb_vps) 45 | 46 | return subset_vps 47 | 48 | 49 | def get_measurement_config( 50 | experiment_uuid: UUID, 51 | prefix_measurement_uuid: UUID, 52 | target_measurement_uuid: UUID, 53 | targets: list, 54 | target_prefixes: list, 55 | vps: dict, 56 | dry_run=False, 57 | ) -> dict: 58 | """return measurement config for future retrieval""" 59 | return { 60 | "experiment_uuid": str(experiment_uuid), 61 | "status": "ongoing", 62 | "start_time": str(datetime.now()), 63 | "end_time": None, 64 | "is_dry_run": dry_run, 65 | "nb_targets": len(targets), 66 | "nb_vps": len(vps), 67 | "description": "measurements from a set of vps towards all targets/target prefixes", 68 | "af": 4, 69 | "target_measurements": { 70 | "measurement_uuid": str(target_measurement_uuid), 71 | "targets": targets, 72 | "vps": vps, 73 | "end_time": None, 74 | }, 75 | "prefix_measurements": { 76 | "measurement_uuid": str(prefix_measurement_uuid), 77 | "targets": target_prefixes, 78 | "vps": vps, 79 | "end_time": None, 80 | }, 81 | } 82 | 83 | 84 | def save_measurement_config(measurement_config: dict, out_path: Path) -> None: 85 | """save measurement config""" 86 | 87 | try: 88 | if ( 89 | measurement_config["prefix_measurements"]["end_time"] is not None 90 | and measurement_config["target_measurements"]["end_time"] is not None 91 | ): 92 | measurement_config["end_time"] = str(datetime.now()) 93 | measurement_config["status"] = "finished" 94 | except KeyError: 95 | pass 96 | 97 | dump_json(measurement_config, out_path) 98 | 99 | 100 | def get_target_prefixes(targets: list) -> list: 101 | """from a set of targets ip addresses return their /24 prefixes""" 102 | return [get_prefix_from_ip(target_addr) for target_addr in targets] 103 | 104 | 105 | def ping_prefixes( 106 | measurement_uuid: UUID, 107 | measurement_config: dict, 108 | target_prefixes: list, 109 | targets_per_prefix: dict[list], 110 | vps: list[dict], 111 | dry_run: bool = False, 112 | use_cache: bool = True, 113 | cache_file: Path = PREFIX_MEASUREMENT_RESULTS, 114 | ) -> None: 115 | """ping all targets prefixes from all vps""" 116 | 117 | pinger = PING() 118 | 119 | try: 120 | # load cached prefix results in case measurement was interrupted 121 | if use_cache: 122 | cached_results = load_json(cache_file) 123 | 124 | if cached_results: 125 | logger.info( 126 | f"initial length targets: {len(targets_per_prefix)}, cached measurements : {len(cached_results)}" 127 | ) 128 | 129 | # get prefixes out of targets 130 | cached_results = [ 131 | get_prefix_from_ip(target["dst_addr"]) for target in cached_results 132 | ] 133 | for subnet in cached_results: 134 | if subnet not in targets_per_prefix: 135 | continue 136 | 
targets_per_prefix.pop(subnet) 137 | 138 | logger.info( 139 | f"after removing cached: {len(targets_per_prefix)}, cached measurements : {len(cached_results)}" 140 | ) 141 | except FileNotFoundError: 142 | logger.info("No cached results available") 143 | pass 144 | 145 | logger.info( 146 | f"Starting measurements {str(measurement_uuid)} with parameters: dry_run={dry_run}; nb_targets={len(target_prefixes)}; nb_vps={len(vps)}." 147 | ) 148 | 149 | # measurement for 3 targets in every target prefixes 150 | ids, start_time, end_time = pinger.ping_by_prefix( 151 | target_prefixes=target_prefixes, 152 | vps=vps, 153 | targets_per_prefix=targets_per_prefix, 154 | tag=measurement_uuid, 155 | dry_run=dry_run, 156 | ) 157 | 158 | # overwrite ids 159 | if "ids" in measurement_config["prefix_measurements"]: 160 | ids.extend(measurement_config["prefix_measurements"]["ids"]) 161 | 162 | measurement_config["prefix_measurements"]["start_time"] = start_time 163 | measurement_config["prefix_measurements"]["end_time"] = end_time 164 | 165 | 166 | def ping_targets( 167 | measurement_uuid: UUID, 168 | measurement_config: dict, 169 | targets: list[dict], 170 | vps: list[dict], 171 | dry_run: bool = False, 172 | use_cache: bool = True, 173 | cache_file: Path = TARGET_MEASUREMENT_RESULTS, 174 | ) -> None: 175 | """ping all targets using all vps""" 176 | 177 | pinger = PING() 178 | 179 | targets = [t["address_v4"] for t in targets] 180 | 181 | try: 182 | if use_cache: 183 | cached_results = load_json(cache_file) 184 | logger.info( 185 | f"initial length targets: {len(targets)}, cached measurements : {len(cached_results)}" 186 | ) 187 | 188 | cached_results = [c["dst_addr"] for c in cached_results] 189 | 190 | targets = list(set(targets).difference(set(cached_results))) 191 | 192 | logger.info( 193 | f"after removing cached: {len(targets)}, cached measurements : {len(cached_results)}" 194 | ) 195 | except FileNotFoundError: 196 | logger.info("No cached results available") 197 | pass 198 | 199 | logger.info( 200 | f"Starting measurements {str(measurement_uuid)} with parameters: dry_run={dry_run}; nb_targets={len(targets)}; nb_vps={len(vps)}." 

    ids, start_time, end_time = pinger.ping_by_target(
        targets=targets, vps=vps, tag=measurement_uuid, dry_run=dry_run
    )

    # keep measurement ids from a previous, interrupted run
    if "ids" in measurement_config["target_measurements"]:
        ids.extend(measurement_config["target_measurements"]["ids"])

    measurement_config["target_measurements"]["start_time"] = start_time
    measurement_config["target_measurements"]["end_time"] = end_time


def get_latest_measurements(config_path: Path) -> dict:
    """retrieve the latest measurement config"""
    try:
        assert config_path.is_dir()
    except AssertionError:
        logger.error(f"config path is not a dir: {config_path}")

    latest: datetime = None
    latest_config = None
    for file in config_path.iterdir():
        measurement_config = load_json(file)
        start_time = parser.isoparse(measurement_config["start_time"])
        # keep the config with the most recent start time
        if latest is None or latest < start_time:
            latest = start_time
            latest_config = measurement_config

    return latest_config


def retrieve_results(
    measurement_uuid: str,
    out_file: Path,
) -> list:
    """query the RIPE Atlas API to retrieve all measurement results"""
    # fetch results from the API
    measurement_results = get_measurements_from_tag(measurement_uuid)

    logger.info(
        f"nb measurements retrieved: {len(measurement_results)} for measurement_uuid: {measurement_uuid}"
    )

    # save results in a cache file
    dump_json(measurement_results, out_file)

    return measurement_results


def insert_prefix_results(results: list) -> None:
    """insert prefix measurement results with the VALUES insert method"""
    rows = []
    values_description = (
        "src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto"
    )

    if not results:
        raise RuntimeError(f"no data to insert, data = {results}")

    for result in results:
        try:
            # parse response
            src = result["src_addr"]
            dst = result["dst_addr"]
            prb_id = result["prb_id"]
            date = result["timestamp"]
            sent = result["sent"]
            rcvd = result["rcvd"]
            # keep only replies that carry an rtt (timeouts do not)
            rtts = [r["rtt"] for r in result["result"] if "rtt" in r] or [-1]
            min = result["min"]
            mean = result["avg"]
            msm_id = result["msm_id"]
            proto = 0

            row = [src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto]

            rows.append(row)
        except KeyError as e:
            logger.warning(f"Some measurements do not contain results: {e}")

    clickhouse = Clickhouse()
    query = clickhouse.insert_from_values_query(
        USER_VPS_TO_PREFIX_TABLE, values_description
    )
    clickhouse.insert_from_values(query, rows)

    logger.info(
        f"Prefix measurements successfully inserted in table: {USER_VPS_TO_PREFIX_TABLE}"
    )


def insert_target_results(results: list) -> None:
    """insert target measurement results with the VALUES insert method"""
    rows = []
    values_description = (
        "src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto"
    )
    for result in results:
        # parse response
        src = result["src_addr"]
        dst = result["dst_addr"]
        prb_id = result["prb_id"]
        date = result["timestamp"]
        sent = result["sent"]
        rcvd = result["rcvd"]
        # keep only replies that carry an rtt (timeouts do not)
        rtts = [r["rtt"] for r in result["result"] if "rtt" in r] or [-1]
        min = result["min"]
        mean = result["avg"]
        msm_id = result["msm_id"]
        proto = 0

        row = [src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto]

        rows.append(row)

    clickhouse = Clickhouse()
    query = clickhouse.insert_from_values_query(
        USER_VPS_TO_TARGET_TABLE, values_description
    )
    clickhouse.insert_from_values(query, rows)

    logger.info(
        f"Target measurements successfully inserted in table: {USER_VPS_TO_TARGET_TABLE}"
    )
--------------------------------------------------------------------------------
/scripts/utils/plot_utils.py:
--------------------------------------------------------------------------------
"""Functions to plot figures in a nice way"""

from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib

matplotlib.use("Agg")

font = {"weight": "bold", "size": 16}  # 'family' : 'normal',
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42
fontsize_axis = 17
font_size_alone = 14
matplotlib.rc("font", **font)

markers = ["o", "s", "v", "^"]
linestyles = ["-", "--", "-.", ":"]

colors_blind = [
    ["blue", (0, 114.0 / 255, 178.0 / 255)],
    ["orange", (230.0 / 255, 159.0 / 255, 0)],
    ["reddish_purple", (204.0 / 255, 121.0 / 255, 167.0 / 255)],
    ["black", (0, 0, 0)],
    ["bluish_green", (0, 158.0 / 255, 115.0 / 255)],
    ["sky_blue", (86.0 / 255, 180.0 / 255, 233.0 / 255)],
    ["vermillon", (213.0 / 255, 94.0 / 255, 0)],
    # ["yellow", (240.0 / 255, 228.0 / 255, 66.0 / 255)],
]


def plot_multiple_cdf(
    Ys,
    n_bins,
    xmin,
    xmax,
    xlabel,
    ylabel,
    legend,
    ymin=0,
    ymax=1.05,
    xticks=None,
    xticks_labels=None,
    xscale="linear",
    yscale="linear",
    cumulative=True,
    figure=None,
    axes=None,
    offset=0,
    colors_arg=None,
    linestyles_arg=None,
):
    if figure is not None and axes is not None:
        fig = figure
        ax = axes
    else:
        subplots = plt.subplots()
        fig, ax = subplots
    ax.set_xlabel(xlabel, fontsize=fontsize_axis)
    ax.set_ylabel(ylabel, fontsize=fontsize_axis)
    # title = title + " CDF"
    # plt.title("CDF", fontsize=fontsize_axis)

    ax.grid(linestyle="dotted")
    if len(Ys) == 1:
        i = 0
        Y = Ys[i]
        if colors_arg is not None:
            color = colors_arg[i][1]
        else:
            color = colors_blind[(i + offset) % len(colors_blind)][1]

        if linestyles_arg is not None:
            linestyle = linestyles_arg[i]
        else:
            linestyle = linestyles[(i + offset) % len(linestyles)]

        n, bins, patches = ax.hist(
            Y,
            density=True,
            histtype="step",
            bins=n_bins,
            cumulative=cumulative,
            linewidth=1.35,
            color=color,
            linestyle=linestyle,
        )
        patches[0].set_xy(patches[0].get_xy()[1:-1])
    else:
        for i in range(0, len(Ys)):
            Y = Ys[i]
            if colors_arg is not None:
                color = colors_arg[i][1]
            else:
                color = colors_blind[(i + offset) % len(colors_blind)][1]

            if linestyles_arg is not None:
                linestyle = linestyles_arg[i]
            else:
                linestyle = linestyles[(i + offset) % len(linestyles)]

            n, bins, patches = ax.hist(
                Y,
                density=True,
                histtype="step",
                bins=n_bins,
                cumulative=cumulative,
                linewidth=1.35,
                label=legend[i],
                color=color,
                linestyle=linestyle,
            )
            patches[0].set_xy(patches[0].get_xy()[1:-1])

    # plt.xscale("symlog")
    # xticks = ax.xaxis.get_major_ticks()
    # xticks[1].label1.set_visible(False)
    # # xticks[2].label1.set_visible(False)
    # xticks[-2].label1.set_visible(False)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_xlim(left=xmin, right=xmax)
    ax.set_ylim(bottom=ymin, top=ymax)
    if xticks is not None:
        ax.set_xticks(xticks)
        # xtickNames = plt.setp(ax, xticklabels=[f"{r}" for r in x_ticks])
    if xticks_labels is not None:
        ax.set_xticklabels(xticks_labels)

    # Normalize the data to a proper PDF
    # plt.tight_layout()
    # plt.savefig(r"resources/figures/" + ofile + ".pdf")
    return fig, ax


def plot_multiple_error_bars(
    X, Ys, Yerrs, xmin, xmax, ymin, ymax, xlabel, ylabel, xscale, yscale, labels
):
    fig, ax = plt.subplots()
    ax.set_xlabel(xlabel, fontsize=fontsize_axis)

    ax.set_ylabel(ylabel, fontsize=fontsize_axis)
    ax.grid(linestyle="dotted")

    # x_ticks = [inf_born+1]
    for i in range(len(Ys)):
        Y = Ys[i]
        Yerr = Yerrs[i]
        lns1 = ax.errorbar(
            X,
            Y,
            Yerr,
            label=labels[i],
            linewidth=0.5,
            marker=markers[i % len(markers)],
            markersize=1,
            markeredgewidth=1,
            capsize=2,
        )
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_xlim(left=xmin, right=xmax)
    ax.set_ylim(bottom=ymin, top=ymax)
    return fig, ax


def plot_save(ofile: Path, is_tight_layout):
    # make sure the output directory exists
    if not ofile.parent.exists():
        ofile.parent.mkdir(parents=True, exist_ok=True)

    if is_tight_layout:
        plt.tight_layout()
    # plt.show()
    plt.savefig(ofile)

    # plt.clf()


def homogenize_legend(ax, legend_location, legend_size=14):
    handles, labels = ax.get_legend_handles_labels()
    new_handles = []
    for h in handles:
        if isinstance(h, Line2D):
            new_handles.append(h)
        elif isinstance(h, Polygon):
            new_handles.append(
                Line2D([], [], linestyle=h.get_linestyle(), color=h.get_edgecolor())
            )
    ax.legend(
        loc=legend_location,
        prop={"size": legend_size},
        handles=new_handles,
        labels=labels,
    )


def plot_scatter_multiple(
    Xs,
    Ys,
    xmin,
    xmax,
    ymin,
    ymax,
    xscale,
    yscale,
    xlabel,
    ylabel,
    markers,
    marker_colors,
    marker_size,
):
    fig, ax = plt.subplots()

    # ax.set_xlabel(title, fontsize=fontsize_axis)
    # plt.title("CDF", fontsize=fontsize_axis)

    # x_ticks = [inf_born]
    # x_ticks.extend(np.arange(inf_born, sup_born, xtick_interval))
    # ax.set_xticks(x_ticks)
    # xtickNames = plt.setp(ax, xticklabels=["{0:.1f}".format(r) for r in x_ticks])
    # ax.set_xticklabels(xtickNames, rotation=45)
    # ax.set_xticklabels(xtickNames)

    ax.grid(linestyle="dotted")
    ax.set_xlabel(xlabel, fontsize=fontsize_axis)
    ax.set_ylabel(ylabel, fontsize=fontsize_axis)

    for i in range(0, len(Xs)):
        X = Xs[i]
        Y = Ys[i]

        # , markersize=10, markeredgewidth=2)
        ax.scatter(X, Y, c=marker_colors[i], marker=markers[i], s=marker_size[i])
        # ax.plot(X, Y)
        # patches[0].set_xy(patches[0].get_xy()[:-1])
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)

    ax.set_xlim(left=xmin, right=xmax)
    ax.set_ylim(bottom=ymin, top=ymax)

    return fig, ax
--------------------------------------------------------------------------------
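The measurement helpers above (in `scripts/utils/measurement_utils.py`) cover the whole target-ping workflow: loading targets and vantage points, building and saving a measurement config, scheduling pings, retrieving results by tag, and inserting them into ClickHouse. The snippet below is a minimal, hypothetical sketch of how they can be chained for a target measurement. The file paths and sample sizes are placeholders, the target file is assumed to contain entries with an `address_v4` field (as `ping_targets` expects), and `dry_run=True` avoids spending RIPE Atlas credits.

```python
from pathlib import Path
from uuid import uuid4

from scripts.utils.measurement_utils import (
    get_measurement_config,
    get_target_prefixes,
    insert_target_results,
    load_targets,
    load_vps,
    ping_targets,
    retrieve_results,
    save_measurement_config,
)

# hypothetical input/output paths: adapt them to your own dataset and result files
TARGETS_FILE = Path("datasets/user_targets.json")
VPS_FILE = Path("datasets/user_vps.json")
CONFIG_FILE = Path("measurements/measurement_config.json")
RESULTS_FILE = Path("measurements/results/user_target_results.json")

targets = load_targets(TARGETS_FILE, nb_target=100)
vps = load_vps(VPS_FILE, nb_vps=50)
# the target file is assumed to hold dicts with an "address_v4" field
target_prefixes = get_target_prefixes([t["address_v4"] for t in targets])

experiment_uuid = uuid4()
target_measurement_uuid = uuid4()
prefix_measurement_uuid = uuid4()

measurement_config = get_measurement_config(
    experiment_uuid=experiment_uuid,
    prefix_measurement_uuid=prefix_measurement_uuid,
    target_measurement_uuid=target_measurement_uuid,
    targets=targets,
    target_prefixes=target_prefixes,
    vps=vps,
    dry_run=True,  # set to False to actually schedule (and pay for) measurements
)

ping_targets(
    measurement_uuid=target_measurement_uuid,
    measurement_config=measurement_config,
    targets=targets,
    vps=vps,
    dry_run=True,
)
save_measurement_config(measurement_config, CONFIG_FILE)

# once the measurements have finished, fetch them by tag and load them into ClickHouse
results = retrieve_results(str(target_measurement_uuid), RESULTS_FILE)
insert_target_results(results)
```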
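Similarly, the helpers in `scripts/utils/plot_utils.py` are thin wrappers around matplotlib used for the paper's figures. Below is a small sketch with fabricated data, assuming nothing beyond the functions shown above; the error samples and output path are made up for illustration.

```python
import random
from pathlib import Path

from scripts.utils.plot_utils import homogenize_legend, plot_multiple_cdf, plot_save

# fabricated geolocation-error samples (km), purely for illustration
errors_a = [random.lognormvariate(3, 1) for _ in range(1000)]
errors_b = [random.lognormvariate(4, 1) for _ in range(1000)]

fig, ax = plot_multiple_cdf(
    Ys=[errors_a, errors_b],
    n_bins=1000,
    xmin=1,
    xmax=10_000,
    xlabel="Geolocation error (km)",
    ylabel="CDF of targets",
    legend=["method A", "method B"],
    xscale="log",
)
homogenize_legend(ax, legend_location="lower right")

# hypothetical output path; plot_save creates the parent directory if needed
plot_save(Path("figures/error_cdf.pdf"), is_tight_layout=True)
```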