├── .env.example
├── .gitignore
├── README.md
├── analysis
├── million_scale.ipynb
├── million_scale.py
├── plot.ipynb
├── ripe_atlas_probes_bias.ipynb
└── tables.ipynb
├── clickhouse_files
├── init-db.sh
└── users.d
│ └── default.xml
├── datasets
└── create_datasets.ipynb
├── default.py
├── install.sh
├── logger.py
├── measurements
├── landmark_traceroutes.ipynb
├── million_scale_measurements.ipynb
└── million_scale_measurements.py
├── poetry.lock
├── pyproject.toml
└── scripts
├── analysis
└── analysis.py
├── ripe_atlas
├── atlas_api.py
└── ping_and_traceroute_classes.py
├── street_level
├── landmark.py
├── three_tiers.py
└── traceroutes_results.py
└── utils
├── clickhouse.py
├── clickhouse_installer.py
├── credentials.py
├── file_utils.py
├── helpers.py
├── measurement_utils.py
└── plot_utils.py
/.env.example:
--------------------------------------------------------------------------------
1 | RIPE_USERNAME=
2 | RIPE_SECRET_KEY=
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # output files
2 | *.fsdb
3 | *.pdf
4 | *.csv
5 | *.dat
6 | *.tif
7 | *.tree
8 | *.zst
9 | *.json
10 | *.dat
11 | *.txt
12 |
13 | measurements/results
14 | clickhouse_files/data/
15 | clickhouse_files/logs/
16 | clickhouse_files/clickhouse
17 |
18 | # Byte-compiled / optimized / DLL files
19 | __pycache__/
20 | *.py[cod]
21 | *$py.class
22 |
23 | # C extensions
24 | *.so
25 |
26 | # Distribution / packaging
27 | .Python
28 | build/
29 | develop-eggs/
30 | dist/
31 | downloads/
32 | eggs/
33 | .eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | wheels/
40 | share/python-wheels/
41 | *.egg-info/
42 | .installed.cfg
43 | *.egg
44 | MANIFEST
45 |
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .nox/
60 | .coverage
61 | .coverage.*
62 | .cache
63 | nosetests.xml
64 | coverage.xml
65 | *.cover
66 | *.py,cover
67 | .hypothesis/
68 | .pytest_cache/
69 | cover/
70 |
71 | # Translations
72 | *.mo
73 | *.pot
74 |
75 | # Django stuff:
76 | *.log
77 | local_settings.py
78 | db.sqlite3
79 | db.sqlite3-journal
80 |
81 | # Flask stuff:
82 | instance/
83 | .webassets-cache
84 |
85 | # Scrapy stuff:
86 | .scrapy
87 |
88 | # Sphinx documentation
89 | docs/_build/
90 |
91 | # PyBuilder
92 | .pybuilder/
93 | target/
94 |
95 | # Jupyter Notebook
96 | .ipynb_checkpoints
97 |
98 | # IPython
99 | profile_default/
100 | ipython_config.py
101 |
102 | # pyenv
103 | # For a library or package, you might want to ignore these files since the code is
104 | # intended to run in multiple environments; otherwise, check them in:
105 | # .python-version
106 |
107 | # pipenv
108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
111 | # install all needed dependencies.
112 | #Pipfile.lock
113 |
114 | # poetry
115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
116 | # This is especially recommended for binary packages to ensure reproducibility, and is more
117 | # commonly ignored for libraries.
118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
119 | #poetry.lock
120 |
121 | # pdm
122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
123 | #pdm.lock
124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
125 | # in version control.
126 | # https://pdm.fming.dev/#use-with-ide
127 | .pdm.toml
128 |
129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
130 | __pypackages__/
131 |
132 | # Celery stuff
133 | celerybeat-schedule
134 | celerybeat.pid
135 |
136 | # SageMath parsed files
137 | *.sage.py
138 |
139 | # Environments
140 | .env
141 | .venv
142 | env/
143 | venv/
144 | ENV/
145 | env.bak/
146 | venv.bak/
147 |
148 | # Spyder project settings
149 | .spyderproject
150 | .spyproject
151 |
152 | # Rope project settings
153 | .ropeproject
154 |
155 | # mkdocs documentation
156 | /site
157 |
158 | # mypy
159 | .mypy_cache/
160 | .dmypy.json
161 | dmypy.json
162 |
163 | # Pyre type checker
164 | .pyre/
165 |
166 | # pytype static type analyzer
167 | .pytype/
168 |
169 | # Cython debug symbols
170 | cython_debug/
171 |
172 | # PyCharm
173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175 | # and can be added to the global gitignore or merged into this file. For a more nuclear
176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177 | #.idea/
178 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🗺️ Replication: Towards a Publicly Available Internet scale IP Geolocation Dataset (IMC 2023)
2 | This repository contains the code needed to reproduce and replicate our results in our [IMC 2023 paper]().
3 |
4 | Our study replicates the methodology of two papers that obtained outstanding results on geolocating IP addresses in terms of coverage and accuracy in nowadays Internet on the largest publicly available measurement platform, RIPE Atlas.
5 | These two papers are:
6 |
7 | 1. [Towards geolocation of millions of IP addresses (IMC 2012)](https://dl.acm.org/doi/abs/10.1145/2398776.2398790)
8 |
9 | 2. [Towards Street-Level Client-Independent IP Geolocation (NSDI 2011)](https://www.usenix.org/legacy/event/nsdi11/tech/full_papers/Wang_Yong.pdf).
10 |
11 | They are called million scale and street level papers throughout this README, as done in our paper.
12 |
13 | Our code offers the possibility to:
14 | 1. reproduce our results using our measurement datasets.
15 | 2. replicate our methodology with different targets and vantage points. For now, only RIPE Atlas vantage points are supported, but it should not be difficult to adapt the code to handle other vantage points and targets.
16 |
17 | ## Prerequisites
18 | Our code performs measurements on RIPE Atlas, so be sure to have an account if you want to replicate our methodology with your own RIPE Atlas measurements.
19 |
20 | ⚠️ **To replicate our RIPE Atlas measurements, you will need a lot of credits (millions)**.
21 |
22 |
23 | ## Table of contents
24 |
25 | - [Installation](#installation)
26 | - [Requirements](#requirements)
27 | - [Download datasets](#download-datasets)
28 | - [Clone the repository](#clone-the-repository)
29 | - [Installer](#installer)
30 | - [Install source files](#install-source-files)
31 | - [Clickhouse](#clickhouse)
32 | - [Settings](#settings)
33 | - [Further notice](#further-notice)
34 | - [Reproduction](#reproduction)
35 | - [Run your own measurements](#run-your-own-measurements)
36 |
37 | ## [Installation](#installation)
38 |
39 | ### [Requirements](#requirements)
40 |
41 | - [Python3.9](https://www.python.org/downloads/) (or above)
42 | - [Poetry](https://python-poetry.org/docs/)
43 | - [Docker](https://docs.docker.com/engine/install/)
44 |
45 |
46 | ### [Download datasets](#download-datasets)
47 |
48 | You can fetch our data our on FTP ftp.iris.dioptra.io that will give you the ClickHouse tables dumped in CSV format.
49 |
50 | ### [Clone the reprository](#clone-the-repository)
51 |
52 | ```bash
53 | git clone https://github.com/dioptra-io/geoloc-imc-2023.git
54 | cd geoloc-imc-2023
55 | ```
56 |
57 | ### [Installer](#installer)
58 |
59 | You can use the script **install.sh** to:
60 | - Pull the clickhouse docker image.
61 | - Start the clickhouse server.
62 | - Download clickhouse-client binary.
63 | - Install python project using poetry.
64 | - Create all tables and populate the database with our measurements.
65 |
66 | ```bash
67 | source install.sh
68 | ```
69 | If the installation fails, all necessary steps to use the project are described below.
70 |
71 | ### [Install source files](#install-source-files)
72 |
73 | GeoScale uses poetry has dependency manager, install the project using:
74 | ```bash
75 | poetry shell
76 | poetry lock
77 | poetry install
78 | ```
79 |
80 | ### [Clickhouse](#clickhouse)
81 |
82 | We use docker to run clickhouse server, by default server is listening on localhost on port 8123 and tcp9000. If you prefer using your own docker configuration, please also modify [default.py](default.py)
83 | ```bash
84 |
85 | # pull the docker image
86 | docker pull clickhouse/clickhouse-server:22.6
87 |
88 | # start the server
89 | docker run --rm -d \
90 | -v ./clickhouse_files/data:/var/lib/clickhouse/ \
91 | -v ./clickhouse_files/logs:/var/log/clickhouse-server/ \
92 | -v ./clickhouse_files/users.d:/etc/clickhouse-server/users.d:ro \
93 | -v ./clickhouse_files/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \
94 | -p 8123:8123 \
95 | -p 9000:9000 \
96 | --ulimit nofile=262144:262144 \
97 | clickhouse/clickhouse-server:22.6
98 | ```
99 |
100 | You can either install [clickhouse-client](https://clickhouse.com/docs/en/install) or download clikhouse client binary (by default, [install.sh](install.sh) download binary file).
101 | ```bash
102 | curl https://clickhouse.com/ | sh
103 | mv clickhouse ./clickhouse_files/
104 | ```
105 |
106 | Finally, create all necessary tables and populate it with our own measurements with:
107 | ```bash
108 | python scripts/utils/clickhouse_installer.py
109 | ```
110 |
111 |
112 | ### [Setttings](#settings)
113 |
114 | Our tool relies on ENV variables for configuring clickhouse or interacting with RIPE Atlas API.
115 | An example of necessary ENV variables is given in [.env.example](.env.example). Create your own
116 | env file with following values:
117 | ```.env
118 | RIPE_USERNAME=
119 | RIPE_SECRET_KEY=
120 | ```
121 |
122 | ⚠️ **IF** you used, your own clickhouse configuration, you can modify the following ENV:
123 | ```
124 | # clickhouse settings
125 | CLICKHOUSE_CLIENT=
126 | CLICKHOUSE_HOST=
127 | CLICKHOUSE_DB=
128 | CLICKHOUSE_USER=
129 | CLICKHOUSE_PASSWORD=
130 | ```
131 | ### [Further notice](#notice)
132 |
133 | #### Test environment
134 |
135 | The project has been run on:
136 | - CentOS 7.5
137 | - Python 3.9
138 | - Server with 64GB RAM, 32 cores.
139 |
140 | ⚠️ Some scripts and analysis can use a lot of CPU and RAM (tens of GB) and last for hours.
141 |
142 |
143 | ## [Reproducing our results](#reproduction)
144 |
145 | We provide python scripts and jupyter notebooks to reproduce the results and the graphs that we got in replicating the million scale and the street level papers.
146 |
147 | ### Million Scale
148 |
149 | You can reproduce Million scale results using a jupyter notebook: [million_scale.ipynb](./analysis/million_scale.ipynb)
150 |
151 | Alternatively you can also use the python script in background, as some steps are vey long to execute (several hours):
152 | ```bash
153 | nohup python analysis/million_scale.py > output.log &
154 | ```
155 |
156 | All analysis results can be found in **./analysis/results**
157 |
158 | ### Street level
159 |
160 | ⚠️ The tier 1 of the Street-level replication (See the paper for more details) relies on results calculated by the million scale technique. You need to run the million scale notebook/scripts **before** running those of street-level.
161 |
162 | No additional steps are necessary to reproduce the street-level experiment.
163 |
164 | ### Generating figures
165 |
166 | You can directly use notebooks [plot.ipynb](./analysis/plot.ipynb) and [tables.ipynb](./analysis/tables.ipynb) to produce the figures and tables of our paper.
167 |
168 | ## [Run your own measurements](#run-your-own-measurements)
169 |
170 | You can also run your own measurements on custom datasets of targets (anchors) and vantage points (probes).
171 |
172 | ### First step: generate targets and vantage points datasets
173 |
174 | The jupyter notebook [create_dataset](./datasets/create_datasets.ipynb) will generate:
175 | - the set of probes (used as vantage points)
176 | - the set of anchors (used as targets)
177 | - filter both sets by removing problematic probes (wrongly geolocated for example)
178 |
179 | All generated files will be placed in /datasets/user_datasets.
180 |
181 | ### Second step: run measurements
182 |
183 | With [million_scale_measurements.ipynb](./measurements/million_scale_measurements.ipynb), you can select a subset of vantage points and targets and run measurements on RIPE Atlas.
184 |
185 | This script will start measurements for:
186 | 1. towards all targets from all vantage points
187 | 2. towards 3 responsive addresses for each target from all vantage points
188 |
189 | ⚠️ These measurements might cost a lot of RIPE Atlas credits and time if you run them on large datasets (default is only 2 targets and 4 vantage points).
190 |
191 | ### Third step: analyze your results
192 |
193 | Perform the analysis by using the same step described previously on your own measurements results and datasets by setting the boolean variable ```repro = True```, at the beginning of [million_scale.ipynb](./analysis/million_scale.ipynb) (or [million_scale.py](./analysis/million_scale.py) if you are using the script).
194 |
195 |
196 |
197 | TODO: Street level
198 |
199 | ## 📚 Publications
200 |
201 | ```bibtex
202 | @inproceedings{darwich2023replication,
203 | title={Replication: Towards a Publicly Available Internet scale IP Geolocation Dataset},
204 | author={Darwich, Omar and Rimlinger, Hugo and Dreyfus, Milo and Gouel, Matthieu and Vermeulen, Kevin},
205 | booktitle={Proceedings of the 2023 ACM on Internet Measurement Conference},
206 | pages={1--15},
207 | year={2023}
208 | }
209 | ```
210 |
211 |
212 | ## 🧑💻 Authors
213 |
214 | This project is the result of a collaboration between the [LAAS-CNRS](https://www.laas.fr/public/) and [Sorbonne Université](https://www.sorbonne-universite.fr/).
215 |
216 |
--------------------------------------------------------------------------------
/analysis/million_scale.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# First step of the analysis\n",
8 | "\n",
9 | "Preprocess results and save them before they can be plotted. \n",
10 | "\n",
11 | "To do after measurements notebooks"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from scripts.utils.file_utils import load_json, dump_json\n",
21 | "\n",
22 | "from scripts.analysis.analysis import *\n",
23 | "from default import *\n",
24 | "\n",
25 | "# set to True to use your own datasets/measurements\n",
26 | "run_repro = False\n",
27 | "if run_repro:\n",
28 | " # DATASET FILES\n",
29 | " PROBES_FILE = REPRO_PROBES_FILE\n",
30 | " PROBES_AND_ANCHORS_FILE = REPRO_PROBES_AND_ANCHORS_FILE\n",
31 | " FILTERED_PROBES_FILE = REPRO_FILTERED_PROBES_FILE\n",
32 | " GREEDY_PROBES_FILE = REPRO_GREEDY_PROBES_FILE\n",
33 | " PAIRWISE_DISTANCE_FILE = REPRO_PAIRWISE_DISTANCE_FILE\n",
34 | " VPS_TO_TARGET_TABLE = PROBES_TO_ANCHORS_PING_TABLE\n",
35 | " VPS_TO_PREFIX_TABLE = PROBES_TO_PREFIX_TABLE\n",
36 | "\n",
37 | " # RESULT FILES\n",
38 | " PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE\n",
39 | " ROUND_BASED_ALGORITHM_FILE = REPRO_ROUND_BASED_ALGORITHM_FILE\n",
40 | " ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE\n",
41 | " VP_SELECTION_ALGORITHM_PROBES_1_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE\n",
42 | " VP_SELECTION_ALGORITHM_PROBES_3_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE\n",
43 | " VP_SELECTION_ALGORITHM_PROBES_10_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE\n",
44 | " \n",
45 | "else:\n",
46 | " # DATASET FILES\n",
47 | " PROBES_FILE = USER_PROBES_FILE\n",
48 | " PROBES_AND_ANCHORS_FILE = USER_PROBES_AND_ANCHORS_FILE\n",
49 | " FILTERED_PROBES_FILE = USER_FILTERED_PROBES_FILE\n",
50 | " GREEDY_PROBES_FILE = USER_GREEDY_PROBES_FILE\n",
51 | " PAIRWISE_DISTANCE_FILE = USER_PAIRWISE_DISTANCE_FILE\n",
52 | " VPS_TO_TARGET_TABLE = USER_VPS_TO_TARGET_TABLE\n",
53 | " VPS_TO_PREFIX_TABLE = USER_VPS_TO_PREFIX_TABLE\n",
54 | "\n",
55 | " # RESULT FILES\n",
56 | " PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE\n",
57 | " ROUND_BASED_ALGORITHM_FILE = USER_ROUND_BASED_ALGORITHM_FILE\n",
58 | " ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE\n",
59 | " VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE\n",
60 | " VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE\n",
61 | " VP_SELECTION_ALGORITHM_PROBES_10_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE\n",
62 | "\n",
63 | "LIMIT = 1000"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "filtered_probes = load_json(FILTERED_PROBES_FILE)\n",
73 | "\n",
74 | "filter = \"\"\n",
75 | "if len(filtered_probes) > 0:\n",
76 | " # Remove probes that are wrongly geolocated\n",
77 | " in_clause = f\"\".join(\n",
78 | " [f\",toIPv4('{p}')\" for p in filtered_probes])[1:]\n",
79 | " filter += f\"AND dst not in ({in_clause}) AND src not in ({in_clause}) \"\n"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "## Compute errors\n",
87 | "\n",
88 | "Compute the median error between the guessed geolocations and the real geolocations"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 4,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n",
98 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, vp_distance_matrix, probes_per_ip = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 5,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=70)\n",
108 | "\n",
109 | "vps_per_target = {dst: set(vp_coordinates_per_ip.keys())\n",
110 | " for dst in rtt_per_srcs_dst}\n",
111 | "features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip, THRESHOLD_DISTANCES,\n",
112 | " vps_per_target=vps_per_target,\n",
113 | " distance_operator=\">\", max_vps=100000,\n",
114 | " is_use_prefix=False,\n",
115 | " vp_distance_matrix=vp_distance_matrix,\n",
116 | " )\n",
117 | "\n",
118 | "dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE)"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "## Round Algorithm\n",
126 | "\n",
127 | "First is to use a subset of greedy probes, and then take 1 probe/AS in the given CBG area to compute the median error."
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 6,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n",
137 | "\n",
138 | "asn_per_vp_ip = {}\n",
139 | "vp_coordinates_per_ip = {}\n",
140 | "\n",
141 | "for probe in all_probes:\n",
142 | " if \"address_v4\" in probe and \"geometry\" in probe and \"coordinates\" in probe[\"geometry\"]:\n",
143 | " ip_v4_address = probe[\"address_v4\"]\n",
144 | " if ip_v4_address is None:\n",
145 | " continue\n",
146 | " long, lat = probe[\"geometry\"][\"coordinates\"]\n",
147 | " asn_v4 = probe[\"asn_v4\"]\n",
148 | " asn_per_vp_ip[ip_v4_address] = asn_v4\n",
149 | " vp_coordinates_per_ip[ip_v4_address] = lat, long\n"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 7,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "# clickhouse is required here\n",
159 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=100)"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 8,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 9,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "Using 10 tier1_vps\n",
181 | "Using 100 tier1_vps\n",
182 | "Using 300 tier1_vps\n",
183 | "Using 500 tier1_vps\n",
184 | "Using 1000 tier1_vps\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "TIER1_VPS = [10, 100, 300, 500, 1000]\n",
190 | "greedy_probes = load_json(GREEDY_PROBES_FILE)\n",
191 | "error_cdf_per_tier1_vps = {}\n",
192 | "for tier1_vps in TIER1_VPS:\n",
193 | " print(f\"Using {tier1_vps} tier1_vps\")\n",
194 | " error_cdf = round_based_algorithm(greedy_probes, rtt_per_srcs_dst, vp_coordinates_per_ip,\n",
195 | " asn_per_vp_ip,\n",
196 | " tier1_vps,\n",
197 | " threshold=40)\n",
198 | " error_cdf_per_tier1_vps[tier1_vps] = error_cdf\n",
199 | " \n",
200 | "dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE)"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "## Accuracy vs number of vps probes\n",
208 | "WARNING : Time consumming section \n",
209 | "\n",
210 | "Compute median error for each target, depending on the number of initial VPs."
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 10,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n",
220 | "\n",
221 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, \\\n",
222 | " vp_distance_matrix, probe_per_ip = compute_geo_info(\n",
223 | " all_probes, serialized_file=PAIRWISE_DISTANCE_FILE)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 12,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stderr",
233 | "output_type": "stream",
234 | "text": [
235 | "2023-09-13 16:22:03::INFO:root:analysis:: Starting computing for random VPs 100\n",
236 | "2023-09-13 16:23:13::INFO:root:analysis:: Starting computing for random VPs 200\n",
237 | "2023-09-13 16:24:21::INFO:root:analysis:: Starting computing for random VPs 300\n",
238 | "2023-09-13 16:25:31::INFO:root:analysis:: Starting computing for random VPs 400\n"
239 | ]
240 | }
241 | ],
242 | "source": [
243 | "subset_sizes = []\n",
244 | "subset_sizes.extend([i for i in range(100, 500, 100)])\n",
245 | "# subset_sizes.extend([i for i in range(1000, 10001, 1000)])\n",
246 | "\n",
247 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=50)\n",
248 | "\n",
249 | "available_vps = list(vp_coordinates_per_ip.keys())\n",
250 | "accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps(available_vps, rtt_per_srcs_dst, vp_coordinates_per_ip,\n",
251 | " vp_distance_matrix, subset_sizes)\n",
252 | "\n",
253 | "dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE)"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "## VPs selection algorithm\n",
261 | "\n",
262 | "Select respectively the 1, 3, and 10 closest probes (with minimal round trip time) for each target."
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 13,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n",
272 | "\n",
273 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, vp_distance_matrix, probes_per_ip = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 16,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "ping_table_prefix = VPS_TO_PREFIX_TABLE\n",
283 | "ping_table = VPS_TO_TARGET_TABLE\n",
284 | "N_VPS_SELECTION_ALGORITHM = [1, 3, 10]\n",
285 | "results_files = [VP_SELECTION_ALGORITHM_PROBES_1_FILE, VP_SELECTION_ALGORITHM_PROBES_3_FILE, VP_SELECTION_ALGORITHM_PROBES_10_FILE]\n",
286 | "\n",
287 | "rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(ping_table_prefix, filter, threshold=100, is_per_prefix=True)\n",
288 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)\n",
289 | "\n",
290 | "for i, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM):\n",
291 | " vps_per_target = compute_closest_rtt_probes(rtt_per_srcs_dst_prefix,\n",
292 | " vp_coordinates_per_ip,\n",
293 | " vp_distance_matrix,\n",
294 | " n_shortest=n_vp,\n",
295 | " is_prefix=True)\n",
296 | " features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip,\n",
297 | " [0],\n",
298 | " vps_per_target=vps_per_target,\n",
299 | " distance_operator=\">\", max_vps=100000,\n",
300 | " is_use_prefix=True,\n",
301 | " vp_distance_matrix=vp_distance_matrix,\n",
302 | " is_multiprocess=True)\n",
303 | " \n",
304 | " ofile = results_files[i]\n",
305 | " dump_json(features, ofile)"
306 | ]
307 | }
308 | ],
309 | "metadata": {
310 | "kernelspec": {
311 | "display_name": "review-fXCvvitn-py3.10",
312 | "language": "python",
313 | "name": "python3"
314 | },
315 | "language_info": {
316 | "codemirror_mode": {
317 | "name": "ipython",
318 | "version": 3
319 | },
320 | "file_extension": ".py",
321 | "mimetype": "text/x-python",
322 | "name": "python",
323 | "nbconvert_exporter": "python",
324 | "pygments_lexer": "ipython3",
325 | "version": "3.10.9"
326 | },
327 | "orig_nbformat": 4
328 | },
329 | "nbformat": 4,
330 | "nbformat_minor": 2
331 | }
332 |
--------------------------------------------------------------------------------
/analysis/million_scale.py:
--------------------------------------------------------------------------------
1 | from scripts.utils.file_utils import load_json, dump_json
2 |
3 | from scripts.analysis.analysis import *
4 | from default import *
5 |
6 |
7 | if __name__ == "__main__":
8 | # set to True to use your own datasets/measurements
9 | run_repro = True
10 | if run_repro:
11 | # DATASET FILES
12 | PROBES_FILE = REPRO_PROBES_FILE
13 | PROBES_AND_ANCHORS_FILE = REPRO_PROBES_AND_ANCHORS_FILE
14 | FILTERED_PROBES_FILE = REPRO_FILTERED_PROBES_FILE
15 | GREEDY_PROBES_FILE = REPRO_GREEDY_PROBES_FILE
16 | PAIRWISE_DISTANCE_FILE = REPRO_PAIRWISE_DISTANCE_FILE
17 | VPS_TO_TARGET_TABLE = PROBES_TO_ANCHORS_PING_TABLE
18 | VPS_TO_PREFIX_TABLE = PROBES_TO_PREFIX_TABLE
19 |
20 | # RESULT FILES
21 | PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE
22 | ROUND_BASED_ALGORITHM_FILE = REPRO_ROUND_BASED_ALGORITHM_FILE
23 | ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE
24 | VP_SELECTION_ALGORITHM_PROBES_1_FILE = (
25 | REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE
26 | )
27 | VP_SELECTION_ALGORITHM_PROBES_3_FILE = (
28 | REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE
29 | )
30 | VP_SELECTION_ALGORITHM_PROBES_10_FILE = (
31 | REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE
32 | )
33 |
34 | else:
35 | # DATASET FILES
36 | PROBES_FILE = USER_PROBES_FILE
37 | PROBES_AND_ANCHORS_FILE = USER_PROBES_AND_ANCHORS_FILE
38 | FILTERED_PROBES_FILE = USER_FILTERED_PROBES_FILE
39 | GREEDY_PROBES_FILE = USER_GREEDY_PROBES_FILE
40 | PAIRWISE_DISTANCE_FILE = USER_PAIRWISE_DISTANCE_FILE
41 | VPS_TO_TARGET_TABLE = USER_VPS_TO_TARGET_TABLE
42 | VPS_TO_PREFIX_TABLE = USER_VPS_TO_PREFIX_TABLE
43 |
44 | # RESULT FILES
45 | PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE
46 | ROUND_BASED_ALGORITHM_FILE = USER_ROUND_BASED_ALGORITHM_FILE
47 | ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE
48 | VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE
49 | VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE
50 | VP_SELECTION_ALGORITHM_PROBES_10_FILE = (
51 | USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE
52 | )
53 |
54 | LIMIT = 1000
55 |
56 | filtered_probes = load_json(FILTERED_PROBES_FILE)
57 |
58 | filter = ""
59 | if len(filtered_probes) > 0:
60 | # Remove probes that are wrongly geolocated
61 | in_clause = f"".join([f",toIPv4('{p}')" for p in filtered_probes])[1:]
62 | filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) "
63 |
64 | logger.info("Step 1: Compute errors")
65 |
66 | all_probes = load_json(PROBES_AND_ANCHORS_FILE)
67 | (
68 | vp_coordinates_per_ip,
69 | ip_per_coordinates,
70 | country_per_vp,
71 | asn_per_vp,
72 | vp_distance_matrix,
73 | probes_per_ip,
74 | ) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)
75 |
76 | rtt_per_srcs_dst = compute_rtts_per_dst_src(
77 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=70
78 | )
79 |
80 | vps_per_target = {
81 | dst: set(vp_coordinates_per_ip.keys()) for dst in rtt_per_srcs_dst
82 | }
83 | features = compute_geolocation_features_per_ip(
84 | rtt_per_srcs_dst,
85 | vp_coordinates_per_ip,
86 | THRESHOLD_DISTANCES,
87 | vps_per_target=vps_per_target,
88 | distance_operator=">",
89 | max_vps=100000,
90 | is_use_prefix=False,
91 | vp_distance_matrix=vp_distance_matrix,
92 | )
93 |
94 | dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE)
95 |
96 | logger.info("Step 2: Round Algorithm")
97 |
98 | all_probes = load_json(PROBES_AND_ANCHORS_FILE)
99 |
100 | asn_per_vp_ip = {}
101 | vp_coordinates_per_ip = {}
102 |
103 | for probe in all_probes:
104 | if (
105 | "address_v4" in probe
106 | and "geometry" in probe
107 | and "coordinates" in probe["geometry"]
108 | ):
109 | ip_v4_address = probe["address_v4"]
110 | if ip_v4_address is None:
111 | continue
112 | long, lat = probe["geometry"]["coordinates"]
113 | asn_v4 = probe["asn_v4"]
114 | asn_per_vp_ip[ip_v4_address] = asn_v4
115 | vp_coordinates_per_ip[ip_v4_address] = lat, long
116 |
117 | # clickhouse is required here
118 | rtt_per_srcs_dst = compute_rtts_per_dst_src(
119 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=100
120 | )
121 | vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE)
122 |
123 | TIER1_VPS = [10, 100, 300, 500, 1000]
124 | greedy_probes = load_json(GREEDY_PROBES_FILE)
125 | error_cdf_per_tier1_vps = {}
126 | for tier1_vps in TIER1_VPS:
127 | print(f"Using {tier1_vps} tier1_vps")
128 | error_cdf = round_based_algorithm(
129 | greedy_probes,
130 | rtt_per_srcs_dst,
131 | vp_coordinates_per_ip,
132 | asn_per_vp_ip,
133 | tier1_vps,
134 | threshold=40,
135 | )
136 | error_cdf_per_tier1_vps[tier1_vps] = error_cdf
137 |
138 | dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE)
139 |
140 | logger.info("Accuracy vs number of vps probes")
141 | logger.warning("this step might takes several hours")
142 |
143 | all_probes = load_json(PROBES_AND_ANCHORS_FILE)
144 |
145 | (
146 | vp_coordinates_per_ip,
147 | ip_per_coordinates,
148 | country_per_vp,
149 | asn_per_vp,
150 | vp_distance_matrix,
151 | probe_per_ip,
152 | ) = compute_geo_info(all_probes, serialized_file=PAIRWISE_DISTANCE_FILE)
153 |
154 | logger.info("Accuracy vs number of vps probes")
155 |
156 | subset_sizes = []
157 | subset_sizes.extend([i for i in range(100, 1000, 100)])
158 | # subset_sizes.extend([i for i in range(1000, 10001, 1000)])
159 |
160 | rtt_per_srcs_dst = compute_rtts_per_dst_src(
161 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=50
162 | )
163 |
164 | available_vps = list(vp_coordinates_per_ip.keys())
165 | accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps(
166 | available_vps,
167 | rtt_per_srcs_dst,
168 | vp_coordinates_per_ip,
169 | vp_distance_matrix,
170 | subset_sizes,
171 | )
172 |
173 | dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE)
174 |
175 | logger.info("vp selection algorithm")
176 |
177 | all_probes = load_json(PROBES_AND_ANCHORS_FILE)
178 |
179 | (
180 | vp_coordinates_per_ip,
181 | ip_per_coordinates,
182 | country_per_vp,
183 | asn_per_vp,
184 | vp_distance_matrix,
185 | probes_per_ip,
186 | ) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)
187 |
188 | ping_table_prefix = PROBES_TO_PREFIX_TABLE
189 | ping_table = PROBES_TO_ANCHORS_PING_TABLE
190 | N_VPS_SELECTION_ALGORITHM = [1, 3, 10]
191 | results_files = [
192 | VP_SELECTION_ALGORITHM_PROBES_1_FILE,
193 | VP_SELECTION_ALGORITHM_PROBES_3_FILE,
194 | VP_SELECTION_ALGORITHM_PROBES_10_FILE,
195 | ]
196 |
197 | rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(
198 | ping_table_prefix, filter, threshold=100, is_per_prefix=True
199 | )
200 | rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)
201 |
202 | for i, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM):
203 | vps_per_target = compute_closest_rtt_probes(
204 | rtt_per_srcs_dst_prefix,
205 | vp_coordinates_per_ip,
206 | vp_distance_matrix,
207 | n_shortest=n_vp,
208 | is_prefix=True,
209 | )
210 | features = compute_geolocation_features_per_ip(
211 | rtt_per_srcs_dst,
212 | vp_coordinates_per_ip,
213 | [0],
214 | vps_per_target=vps_per_target,
215 | distance_operator=">",
216 | max_vps=100000,
217 | is_use_prefix=True,
218 | vp_distance_matrix=vp_distance_matrix,
219 | is_multiprocess=True,
220 | )
221 |
222 | ofile = results_files[i]
223 | dump_json(features, ofile)
224 |
--------------------------------------------------------------------------------
/analysis/ripe_atlas_probes_bias.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import json\n",
10 | "import pandas as pd\n",
11 | "\n",
12 | "from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, REPRO_PROBES_AND_ANCHORS_FILE, REPRO_ANCHORS_FILE, REPRO_PROBES_FILE"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "# load datasets"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "with ASNS_TYPE_CAIDA.open(\"r\") as f:\n",
29 | " asns_categories_caida = json.load(f)\n",
30 | "\n",
31 | "with ASNS_TYPE_STANFORD.open(\"r\") as f:\n",
32 | " asns_categories_stanford = json.load(f)\n",
33 | " \n",
34 | "with REPRO_PROBES_AND_ANCHORS_FILE.open(\"r\") as f:\n",
35 | " probes_and_anchors = json.load(f)\n",
36 | "\n",
37 | "with REPRO_PROBES_FILE.open(\"r\") as f:\n",
38 | " probes = json.load(f)\n",
39 | "\n",
40 | "with REPRO_ANCHORS_FILE.open(\"r\") as f:\n",
41 | " anchors = json.load(f)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "def get_anchor_as_category(asns_category: dict, ripe_vps_dataset: dict) -> dict:\n",
51 | " \"\"\"return one category per anchor\"\"\"\n",
52 | " ripe_categories = []\n",
53 | "\n",
54 | " for ripe_vp in ripe_vps_dataset:\n",
55 | " try:\n",
56 | " ripe_categories.append({\n",
57 | " \"id\": ripe_vp['id'],\n",
58 | " \"category\": asns_category[str(ripe_vp[\"asn_v4\"])]\n",
59 | " })\n",
60 | " except KeyError:\n",
61 | " ripe_categories.append({\n",
62 | " \"id\": ripe_vp['id'],\n",
63 | " \"category\": \"Unknown\"\n",
64 | " })\n",
65 | " continue\n",
66 | " return ripe_categories\n",
67 | "\n",
68 | "def get_categories_percentage(categories_df: pd.DataFrame) -> dict:\n",
69 | " \"\"\"get percentage per categories from a set of categories\"\"\"\n",
70 | " category_repartition = dict()\n",
71 | "\n",
72 | " category_set = categories_df[\"category\"].unique()\n",
73 | " for category in category_set:\n",
74 | " percentage = len(categories_df[categories_df[\"category\"] == category]) * 100 / len(categories_df[\"id\"])\n",
75 | " category_repartition[category] = percentage\n",
76 | "\n",
77 | " print(f\"{category} : {len(categories_df[categories_df['category'] == category])} ({round(percentage,1)}%)\")\n",
78 | "\n",
79 | " assert round(sum([v for v in category_repartition.values()])) == 100 \n",
80 | "\n",
81 | " return category_repartition"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "# Get targets type"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 4,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "category_caida_anchors = get_anchor_as_category(asns_categories_caida, anchors)\n",
98 | "category_caida_probes = get_anchor_as_category(asns_categories_caida, probes)\n",
99 | "category_caida_probes_and_anchors = get_anchor_as_category(asns_categories_caida, probes_and_anchors)\n",
100 | "\n",
101 | "category_stanford_anchors = get_anchor_as_category(asns_categories_stanford, anchors)\n",
102 | "category_stanford_probes = get_anchor_as_category(asns_categories_stanford, probes)\n",
103 | "category_stanford_probes_and_anchors = get_anchor_as_category(asns_categories_stanford, probes_and_anchors)\n",
104 | "\n",
105 | "caida_df_anchors = pd.DataFrame(category_caida_anchors, columns=[\"id\", \"category\"])\n",
106 | "caida_df_probes = pd.DataFrame(category_caida_probes, columns=[\"id\", \"category\"])\n",
107 | "caida_df_probes_and_anchors = pd.DataFrame(category_caida_probes_and_anchors, columns=[\"id\", \"category\"])\n",
108 | "\n",
109 | "stanford_df_anchors = pd.DataFrame(category_stanford_anchors, columns=[\"id\", \"category\"])\n",
110 | "stanford_df_probes = pd.DataFrame(category_stanford_probes, columns=[\"id\", \"category\"])\n",
111 | "stanford_df_probes_and_anchors = pd.DataFrame(category_stanford_probes_and_anchors, columns=[\"id\", \"category\"])"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "# Caida categories"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 5,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "Anchors results: \n",
131 | "\n",
132 | "Content : 229 (31.7%)\n",
133 | "Access : 211 (29.2%)\n",
134 | "Transit/Access : 197 (27.2%)\n",
135 | "Enterprise : 55 (7.6%)\n",
136 | "tier-1 : 6 (0.8%)\n",
137 | "Unknown : 25 (3.5%)\n",
138 | "\n",
139 | "Probes results: \n",
140 | "\n",
141 | "Access : 9124 (75.2%)\n",
142 | "Transit/Access : 1005 (8.3%)\n",
143 | "Enterprise : 410 (3.4%)\n",
144 | "Unknown : 312 (2.6%)\n",
145 | "Content : 1112 (9.2%)\n",
146 | "tier-1 : 166 (1.4%)\n",
147 | "\n",
148 | "Probes and anchors results: \n",
149 | "\n",
150 | "Access : 9347 (72.4%)\n",
151 | "Transit/Access : 1221 (9.5%)\n",
152 | "Enterprise : 472 (3.7%)\n",
153 | "Unknown : 339 (2.6%)\n",
154 | "Content : 1361 (10.5%)\n",
155 | "tier-1 : 174 (1.3%)\n",
156 | "\n"
157 | ]
158 | }
159 | ],
160 | "source": [
161 | "print(\"Anchors results: \\n\")\n",
162 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_anchors)\n",
163 | "print()\n",
164 | "\n",
165 | "print(\"Probes results: \\n\")\n",
166 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_probes)\n",
167 | "print()\n",
168 | "\n",
169 | "print(\"Probes and anchors results: \\n\")\n",
170 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_probes_and_anchors)\n",
171 | "print()"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "# Stanford categories"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 6,
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "Anchors results: \n",
191 | "\n",
192 | "Computer and Information Technology : 521 (72.1%)\n",
193 | "Education and Research : 38 (5.3%)\n",
194 | "Community Groups and Nonprofits : 33 (4.6%)\n",
195 | "Health Care Services : 2 (0.3%)\n",
196 | "Finance and Insurance : 6 (0.8%)\n",
197 | "Unknown : 53 (7.3%)\n",
198 | "Media, Publishing, and Broadcasting : 21 (2.9%)\n",
199 | "Service : 25 (3.5%)\n",
200 | "Construction and Real Estate : 5 (0.7%)\n",
201 | "Travel and Accommodation : 2 (0.3%)\n",
202 | "Government and Public Administration : 3 (0.4%)\n",
203 | "Retail Stores, Wholesale, and E-commerce Sites : 5 (0.7%)\n",
204 | "Utilities (Excluding Internet Service) : 1 (0.1%)\n",
205 | "Manufacturing : 2 (0.3%)\n",
206 | "Other : 4 (0.6%)\n",
207 | "Museums, Libraries, and Entertainment : 1 (0.1%)\n",
208 | "Freight, Shipment, and Postal Services : 1 (0.1%)\n",
209 | "\n",
210 | "Probes results: \n",
211 | "\n",
212 | "Computer and Information Technology : 10028 (82.7%)\n",
213 | "Community Groups and Nonprofits : 129 (1.1%)\n",
214 | "Unknown : 842 (6.9%)\n",
215 | "Education and Research : 352 (2.9%)\n",
216 | "Construction and Real Estate : 60 (0.5%)\n",
217 | "Manufacturing : 25 (0.2%)\n",
218 | "Service : 300 (2.5%)\n",
219 | "Media, Publishing, and Broadcasting : 183 (1.5%)\n",
220 | "Other : 14 (0.1%)\n",
221 | "Retail Stores, Wholesale, and E-commerce Sites : 105 (0.9%)\n",
222 | "Government and Public Administration : 18 (0.1%)\n",
223 | "Health Care Services : 8 (0.1%)\n",
224 | "Finance and Insurance : 22 (0.2%)\n",
225 | "Utilities (Excluding Internet Service) : 16 (0.1%)\n",
226 | "Museums, Libraries, and Entertainment : 8 (0.1%)\n",
227 | "Travel and Accommodation : 10 (0.1%)\n",
228 | "Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming) : 4 (0.0%)\n",
229 | "Freight, Shipment, and Postal Services : 5 (0.0%)\n",
230 | "\n",
231 | "Probes and anchors results: \n",
232 | "\n",
233 | "Computer and Information Technology : 10590 (82.0%)\n",
234 | "Community Groups and Nonprofits : 163 (1.3%)\n",
235 | "Unknown : 901 (7.0%)\n",
236 | "Education and Research : 393 (3.0%)\n",
237 | "Construction and Real Estate : 65 (0.5%)\n",
238 | "Manufacturing : 27 (0.2%)\n",
239 | "Service : 328 (2.5%)\n",
240 | "Media, Publishing, and Broadcasting : 206 (1.6%)\n",
241 | "Other : 19 (0.1%)\n",
242 | "Retail Stores, Wholesale, and E-commerce Sites : 115 (0.9%)\n",
243 | "Government and Public Administration : 21 (0.2%)\n",
244 | "Health Care Services : 10 (0.1%)\n",
245 | "Finance and Insurance : 28 (0.2%)\n",
246 | "Utilities (Excluding Internet Service) : 17 (0.1%)\n",
247 | "Museums, Libraries, and Entertainment : 9 (0.1%)\n",
248 | "Travel and Accommodation : 12 (0.1%)\n",
249 | "Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming) : 4 (0.0%)\n",
250 | "Freight, Shipment, and Postal Services : 6 (0.0%)\n",
251 | "\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "print(\"Anchors results: \\n\")\n",
257 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_anchors)\n",
258 | "print()\n",
259 | "\n",
260 | "print(\"Probes results: \\n\")\n",
261 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_probes)\n",
262 | "print()\n",
263 | "\n",
264 | "print(\"Probes and anchors results: \\n\")\n",
265 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_probes_and_anchors)\n",
266 | "print()"
267 | ]
268 | }
269 | ],
270 | "metadata": {
271 | "kernelspec": {
272 | "display_name": "review-QY-dYH-y-py3.10",
273 | "language": "python",
274 | "name": "python3"
275 | },
276 | "language_info": {
277 | "codemirror_mode": {
278 | "name": "ipython",
279 | "version": 3
280 | },
281 | "file_extension": ".py",
282 | "mimetype": "text/x-python",
283 | "name": "python",
284 | "nbconvert_exporter": "python",
285 | "pygments_lexer": "ipython3",
286 | "version": "3.10.9"
287 | },
288 | "orig_nbformat": 4
289 | },
290 | "nbformat": 4,
291 | "nbformat_minor": 2
292 | }
293 |
--------------------------------------------------------------------------------
/analysis/tables.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Print tables\n",
8 | "\n",
9 | "Print all the tables of the replication paper \n",
10 | "To do after analysis/million_scale.ipynb"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import pyasn\n",
20 | "\n",
21 | "from ipaddress import ip_network\n",
22 | "from clickhouse_driver import Client\n",
23 | "\n",
24 | "from scripts.utils.file_utils import load_json\n",
25 | "from scripts.utils.clickhouse import Clickhouse\n",
26 | "from scripts.analysis.analysis import get_all_bgp_prefixes, is_same_bgp_prefix, every_tier_result_and_errors\n",
27 | "from scripts.utils.helpers import haversine\n",
28 | "from default import IP_TO_ASN_FILE, ANALYZABLE_FILE, ROUND_BASED_ALGORITHM_FILE, TARGET_TO_LANDMARKS_PING_TABLE"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Measurement overhead"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Figure 3.c of the replication paper"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "round_based_algorithm_results = load_json(ROUND_BASED_ALGORITHM_FILE)\n",
52 | "\n",
53 | "round_based_algorithm_results = {\n",
54 | "int(x): round_based_algorithm_results[x] for x in round_based_algorithm_results}"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "10 5785182\n",
67 | "100 4459050\n",
68 | "300 3205290\n",
69 | "500 2800245\n",
70 | "1000 2817933\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "for tier1_vps, results in sorted(round_based_algorithm_results.items()):\n",
76 | " tier1_vps = int(tier1_vps)\n",
77 | " n_vps_cdf = [r[2] + tier1_vps for r in results if r[2] is not None]\n",
78 | " print(tier1_vps, 3 * sum(n_vps_cdf))"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "## Number of landmarks within a certain radius"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Figure 5.b of the replication paper"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 3,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "Found 78.128.211.119 with a landmark in the same /24\n",
105 | "Found 77.109.180.62 with a landmark in the same /24\n",
106 | "Found 103.143.136.43 with a landmark in the same /24\n"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "data = load_json(ANALYZABLE_FILE)\n",
112 | "\n",
113 | "valid_landmarks_count = 0\n",
114 | "unvalid_landmarks_count = 0\n",
115 | "same_asn_lst = []\n",
116 | "same_24_lst = []\n",
117 | "same_bgp_lst = []\n",
118 | "distances_to_landmarks = []\n",
119 | "all_landmarks = []\n",
120 | "asndb = pyasn.pyasn(str(IP_TO_ASN_FILE))\n",
121 | "bgp_prefixes = get_all_bgp_prefixes()\n",
122 | "\n",
123 | "for _, d in data.items():\n",
124 | " same_asn = 0\n",
125 | " diff_asn = 0\n",
126 | " same_bgp = 0\n",
127 | " diff_bgp = 0\n",
128 | " same_24 = 0\n",
129 | " diff_24 = 0\n",
130 | " all_landmarks.append(0)\n",
131 | " if \"tier2:cdn_count\" in d and \"tier2:landmark_count\" in d and \"tier2:failed_header_test_count\" in d:\n",
132 | " all_landmarks[-1] += d['tier2:landmark_count'] + \\\n",
133 | " d['tier2:cdn_count'] + d['tier2:failed_header_test_count']\n",
134 | " valid_landmarks_count += d['tier2:landmark_count']\n",
135 | " unvalid_landmarks_count += d['tier2:cdn_count'] + \\\n",
136 | " d['tier2:failed_header_test_count']\n",
137 | " if \"tier3:cdn_count\" in d and \"tier3:landmark_count\" in d and \"tier3:failed_header_test_count\" in d:\n",
138 | " all_landmarks[-1] += d['tier3:landmark_count'] + \\\n",
139 | " d['tier3:cdn_count'] + d['tier3:failed_header_test_count']\n",
140 | " valid_landmarks_count += d['tier3:landmark_count']\n",
141 | " unvalid_landmarks_count += d['tier3:cdn_count'] + \\\n",
142 | " d['tier3:failed_header_test_count']\n",
143 | " for f in ['tier2:traceroutes', 'tier3:traceroutes']:\n",
144 | " if f in d:\n",
145 | " for t in d[f]:\n",
146 | "\n",
147 | " ipt = t[1]\n",
148 | " ipl = t[2]\n",
149 | " asnt = asndb.lookup(ipt)[0]\n",
150 | " asnl = asndb.lookup(ipl)[0]\n",
151 | " if asnl != None and asnt != None:\n",
152 | " if asnt == asnl:\n",
153 | " same_asn += 1\n",
154 | " else:\n",
155 | " diff_asn += 1\n",
156 | "\n",
157 | " nt = ip_network(ipt+\"/24\", strict=False).network_address\n",
158 | " nl = ip_network(ipl+\"/24\", strict=False).network_address\n",
159 | " if nt == nl:\n",
160 | " same_24 += 1\n",
161 | " else:\n",
162 | " diff_24 += 1\n",
163 | "\n",
164 | " if is_same_bgp_prefix(ipt, ipl, bgp_prefixes):\n",
165 | " same_bgp += 1\n",
166 | " else:\n",
167 | " diff_bgp += 1\n",
168 | "\n",
169 | " distances = []\n",
170 | " for f in ['tier2:landmarks', 'tier3:landmarks']:\n",
171 | " target_geo = (d['RIPE:lat'], d['RIPE:lon'])\n",
172 | " if f in d:\n",
173 | " for l in d[f]:\n",
174 | " landmark_geo = (l[2], l[3])\n",
175 | " distances.append(haversine(target_geo, landmark_geo))\n",
176 | " distances_to_landmarks.append(distances)\n",
177 | "\n",
178 | " if same_asn != 0 or diff_asn != 0:\n",
179 | " same_asn_lst.append(same_asn/(same_asn+diff_asn))\n",
180 | "\n",
181 | " if same_24 != 0 or diff_24 != 0:\n",
182 | " same_24_lst.append(same_24/(same_24+diff_24))\n",
183 | " if same_24 != 0:\n",
184 | " print(\n",
185 | " f\"Found {d['target_ip']} with a landmark in the same /24\")\n",
186 | " if same_bgp != 0 or diff_bgp != 0:\n",
187 | " same_bgp_lst.append(same_bgp/(diff_bgp+same_bgp))"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 4,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "713 target have potentail landmarks or 0.9861687413554634\n",
200 | "677 target have valid landmarks or 0.9363762102351314\n",
201 | "207 target with a landmark within 1 km or 0.2863070539419087\n",
202 | "419 target with a landmark within 5 km or 0.5795297372060858\n",
203 | "464 target with a landmark within 10 km or 0.6417704011065007\n",
204 | "552 target with a landmark within 40 km or 0.7634854771784232\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "landmarks_all = []\n",
210 | "landmarks_less_1 = []\n",
211 | "landmarks_less_5 = []\n",
212 | "landmarks_less_10 = []\n",
213 | "landmarks_less_40 = []\n",
214 | "\n",
215 | "for landmark_distances in distances_to_landmarks:\n",
216 | " landmarks_all.append(len(landmark_distances))\n",
217 | " landmarks_less_1.append(len([i for i in landmark_distances if i <= 1]))\n",
218 | " landmarks_less_5.append(len([i for i in landmark_distances if i <= 5]))\n",
219 | " landmarks_less_10.append(\n",
220 | " len([i for i in landmark_distances if i <= 10]))\n",
221 | " landmarks_less_40.append(\n",
222 | " len([i for i in landmark_distances if i <= 40]))\n",
223 | "\n",
224 | "lm_a_0 = len([i for i in all_landmarks if i > 0])\n",
225 | "lmv_a_0 = len([i for i in landmarks_all if i > 0])\n",
226 | "lm1_0 = len([i for i in landmarks_less_1 if i > 0])\n",
227 | "lm5_0 = len([i for i in landmarks_less_5 if i > 0])\n",
228 | "lm10_0 = len([i for i in landmarks_less_10 if i > 0])\n",
229 | "lm40_0 = len([i for i in landmarks_less_40 if i > 0])\n",
230 | "\n",
231 | "\n",
232 | "len_all = len(data)\n",
233 | "print(f\"{lm_a_0} target have potentail landmarks or {lm_a_0/len_all}\")\n",
234 | "print(f\"{lmv_a_0} target have valid landmarks or {lmv_a_0/len_all}\")\n",
235 | "print(f\"{lm1_0} target with a landmark within 1 km or {lm1_0/len_all}\")\n",
236 | "print(f\"{lm5_0} target with a landmark within 5 km or {lm5_0/len_all}\")\n",
237 | "print(f\"{lm10_0} target with a landmark within 10 km or {lm10_0/len_all}\")\n",
238 | "print(f\"{lm40_0} target with a landmark within 40 km or {lm40_0/len_all}\")"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 5,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "name": "stderr",
248 | "output_type": "stream",
249 | "text": [
250 | "2023-09-14 13:19:51::INFO:root:analysis:: Tier1 Failed\n"
251 | ]
252 | },
253 | {
254 | "name": "stdout",
255 | "output_type": "stream",
256 | "text": [
257 | "207 targets with landmarks (ping <= 1) or 0.2863070539419087\n",
258 | "419 targets with landmarks (ping <= 5) or 0.5795297372060858\n",
259 | "464 targets with landmarks (ping <= 10) or 0.6417704011065007\n",
260 | "552 targets with landmarks (ping <= 40) or 0.7634854771784232\n",
261 | "723 targets with landmarks (ping <= 9999999999) or 1.0\n"
262 | ]
263 | }
264 | ],
265 | "source": [
266 | "clickhouse_driver = Clickhouse()\n",
267 | "query = clickhouse_driver.get_min_rtt_per_src_dst_prefix_query(TARGET_TO_LANDMARKS_PING_TABLE, filter=\"\", threshold=1000000)\n",
268 | "db_table = clickhouse_driver.execute(query)\n",
269 | "\n",
270 | "rtts = []\n",
271 | "remove_dict = {}\n",
272 | "for l in db_table:\n",
273 | " rtts.append(l[2])\n",
274 | " remove_dict[(l[0], l[1])] = l[2]\n",
275 | "\n",
276 | "error1 = []\n",
277 | "error2 = []\n",
278 | "error3 = []\n",
279 | "error4 = []\n",
280 | "error1ms = []\n",
281 | "error2ms = []\n",
282 | "error5ms = []\n",
283 | "error10ms = []\n",
284 | "\n",
285 | "for _, d in data.items():\n",
286 | " errors = every_tier_result_and_errors(d)\n",
287 | " error1.append(errors['error1'])\n",
288 | " error2.append(errors['error2'])\n",
289 | " error3.append(errors['error3'])\n",
290 | " error4.append(errors['error4'])\n",
291 | " err1ms = 50000\n",
292 | " err2ms = 50000\n",
293 | " err5ms = 50000\n",
294 | " err10ms = 50000\n",
295 | " for f in ['tier2:landmarks', 'tier3:landmarks']:\n",
296 | " if f in d:\n",
297 | " for l_ip, _, l_lat, l_lon in d[f]:\n",
298 | " dist = haversine((l_lat, l_lon), (d['RIPE:lat'], d['RIPE:lon']))\n",
299 | " key_rtt = (l_ip, d['target_ip'])\n",
300 | " if dist < err1ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 1):\n",
301 | " err1ms = dist\n",
302 | " if dist < err2ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 2):\n",
303 | " err2ms = dist\n",
304 | " if dist < err5ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 5):\n",
305 | " err5ms = dist\n",
306 | " if dist < err10ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 10):\n",
307 | " err10ms = dist\n",
308 | " if err1ms != 50000:\n",
309 | " error1ms.append(err1ms)\n",
310 | " else:\n",
311 | " error1ms.append(error1[-1])\n",
312 | " if err2ms != 50000:\n",
313 | " error2ms.append(err2ms)\n",
314 | " else:\n",
315 | " error2ms.append(error1[-1])\n",
316 | " if err5ms != 50000:\n",
317 | " error5ms.append(err5ms)\n",
318 | " else:\n",
319 | " error5ms.append(error1[-1])\n",
320 | " if err10ms != 50000:\n",
321 | " error10ms.append(err10ms)\n",
322 | " else:\n",
323 | " error10ms.append(error1[-1])\n",
324 | "\n",
325 | "for i in [1, 5, 10, 40, 9999999999]:\n",
326 | " c = len([j for j in error1ms if j <= i])\n",
327 | " print(f\"{c} targets with landmarks (ping <= {i}) or {c/len(error1ms)}\")"
328 | ]
329 | }
330 | ],
331 | "metadata": {
332 | "kernelspec": {
333 | "display_name": "review-8XQ99qZ1-py3.10",
334 | "language": "python",
335 | "name": "python3"
336 | },
337 | "language_info": {
338 | "codemirror_mode": {
339 | "name": "ipython",
340 | "version": 3
341 | },
342 | "file_extension": ".py",
343 | "mimetype": "text/x-python",
344 | "name": "python",
345 | "nbconvert_exporter": "python",
346 | "pygments_lexer": "ipython3",
347 | "version": "3.9.13"
348 | },
349 | "orig_nbformat": 4
350 | },
351 | "nbformat": 4,
352 | "nbformat_minor": 2
353 | }
354 |
--------------------------------------------------------------------------------
/clickhouse_files/init-db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | clickhouse client -n <<-EOSQL
5 | CREATE DATABASE IF NOT EXISTS geolocation_replication;
6 | EOSQL
7 |
--------------------------------------------------------------------------------
/clickhouse_files/users.d/default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 1
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/default.py:
--------------------------------------------------------------------------------
1 | """All the reference paths to storing files settings and constants"""
2 |
3 | from pathlib import Path
4 |
5 | # Default path
6 | DEFAULT_DIR: Path = Path(__file__).resolve().parent
7 |
8 |
9 | ##################################################################################################
10 | # CONSTANTS #
11 | ##################################################################################################
12 | THRESHOLD_DISTANCES = [0, 40, 100, 500, 1000]
13 | SPEED_OF_LIGHT = 300000
14 | SPEED_OF_INTERNET = SPEED_OF_LIGHT * 2 / 3
15 |
16 |
17 | # Atlas path
18 | ATLAS_PATH: Path = DEFAULT_DIR / "datasets/atlas/"
19 | ##################################################################################################
20 | # REPRODUCIBILITY DATASET FILES (static) #
21 | ##################################################################################################
22 | REPRO_PATH: Path = DEFAULT_DIR / "datasets/reproducibility_datasets/"
23 | REPRO_ATLAS_PATH: Path = REPRO_PATH / "atlas/"
24 | REPRO_GENERATED_PATH: Path = REPRO_PATH / "generated/"
25 |
26 | REPRO_ANCHORS_FILE: Path = REPRO_ATLAS_PATH / "reproducibility_anchors.json"
27 | REPRO_PROBES_FILE: Path = REPRO_ATLAS_PATH / "reproducibility_probes.json"
28 | REPRO_PROBES_AND_ANCHORS_FILE: Path = (
29 | REPRO_ATLAS_PATH / "reproducibility_probes_and_anchors.json"
30 | )
31 |
32 | REPRO_PAIRWISE_DISTANCE_FILE: Path = (
33 | REPRO_GENERATED_PATH / "reproducibility_pairwise_distance_ripe_probes.json"
34 | )
35 | REPRO_REMOVED_PROBES_FILE: Path = (
36 | REPRO_GENERATED_PATH / "reproducibility_removed_probes.json"
37 | )
38 | REPRO_FILTERED_PROBES_FILE: Path = (
39 | REPRO_GENERATED_PATH / "reproducibility_filtered_probes.json"
40 | )
41 | REPRO_GREEDY_PROBES_FILE: Path = (
42 | REPRO_GENERATED_PATH / "reproducibility_greedy_probes.json"
43 | )
44 | REPRO_HITLIST_FILE: Path = REPRO_GENERATED_PATH / "reproducibility_parsed_hitlist.json"
45 |
46 |
47 | ##################################################################################################
48 | # USER DATASET FILES (generated) #
49 | ##################################################################################################
50 | USER_PATH: Path = DEFAULT_DIR / "datasets/user_datasets/"
51 | USER_ATLAS_PATH: Path = USER_PATH / "atlas/"
52 | USER_GENERATED_PATH: Path = USER_PATH / "generated/"
53 |
54 | USER_ANCHORS_FILE: Path = USER_ATLAS_PATH / "user_anchors.json"
55 | USER_PROBES_FILE: Path = USER_ATLAS_PATH / "user_probes.json"
56 | USER_PROBES_AND_ANCHORS_FILE: Path = USER_ATLAS_PATH / "user_probes_and_anchors.json"
57 |
58 | USER_PAIRWISE_DISTANCE_FILE: Path = (
59 | USER_GENERATED_PATH / "user_pairwise_distance_ripe_probes.json"
60 | )
61 | USER_REMOVED_PROBES_FILE: Path = USER_GENERATED_PATH / "user_removed_probes.json"
62 | USER_FILTERED_PROBES_FILE: Path = USER_GENERATED_PATH / "user_filtered_probes.json"
63 | USER_GREEDY_PROBES_FILE: Path = USER_GENERATED_PATH / "user_greedy_probes.json"
64 | USER_HITLIST_FILE: Path = USER_GENERATED_PATH / "user_parsed_hitlist.json"
65 |
66 | ##################################################################################################
67 | # CLICKHOUSE SETTINGS #
68 | ##################################################################################################
69 | CLICKHOUSE_CLIENT = DEFAULT_DIR / "clickhouse_files/clickhouse"
70 | CLICKHOUSE_HOST = "localhost"
71 | CLICKHOUSE_DB = "geolocation_replication"
72 | CLICKHOUSE_USER = "default"
73 | CLICKHOUSE_PASSWORD = ""
74 |
75 | # tables to store reproduction results
76 | ANCHORS_MESHED_PING_TABLE = "anchors_meshed_pings"
77 | ANCHORS_TO_PREFIX_TABLE = "anchors_to_prefix_pings"
78 | PROBES_TO_PREFIX_TABLE = "probes_to_prefix_pings"
79 | TARGET_TO_LANDMARKS_PING_TABLE = "targets_to_landmarks_pings"
80 | PROBES_TO_ANCHORS_PING_TABLE = "ping_10k_to_anchors"
81 | ANCHORS_MESHED_TRACEROUTE_TABLE = "anchors_meshed_traceroutes"
82 | STREET_LEVEL_TRACEROUTES_TABLE = "street_lvl_traceroutes"
83 |
84 | # tables to store user measurements
85 | USER_VPS_TO_PREFIX_TABLE = "user_vps_to_prefix"
86 | USER_VPS_TO_TARGET_TABLE = "user_vps_to_target"
87 |
88 | USER_TARGET_TO_LANDMARKS_PING_TABLE = "user_targets_to_landmarks_pings"
89 | USER_ANCHORS_MESHED_TRACEROUTE_TABLE = "user_anchors_meshed_traceroutes"
90 | USER_STREET_LEVEL_TRACEROUTES_TABLE = "user_street_lvl_traceroutes"
91 |
92 | # reproduction results files
93 | CLICKHOUSE_STATIC_DATASET: Path = DEFAULT_DIR / "datasets/clickhouse_data"
94 |
95 | ANCHORS_MESHED_PING_FILE = (
96 | CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_MESHED_PING_TABLE}.zst"
97 | )
98 | ANCHORS_TO_PREFIX_FILE = CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_TO_PREFIX_TABLE}.zst"
99 | PROBES_TO_PREFIX_FILE = CLICKHOUSE_STATIC_DATASET / f"{PROBES_TO_PREFIX_TABLE}.zst"
100 | TARGET_TO_LANDMARKS_PING_FILE = (
101 | CLICKHOUSE_STATIC_DATASET / f"{TARGET_TO_LANDMARKS_PING_TABLE}.zst"
102 | )
103 | PROBES_TO_ANCHORS_PING_FILE = (
104 | CLICKHOUSE_STATIC_DATASET / f"{PROBES_TO_ANCHORS_PING_TABLE}.zst"
105 | )
106 | ANCHORS_MESHED_TRACEROUTE_FILE = (
107 | CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_MESHED_TRACEROUTE_TABLE}.zst"
108 | )
109 | STREET_LEVEL_TRACEROUTES_FILE = (
110 | CLICKHOUSE_STATIC_DATASET / f"{STREET_LEVEL_TRACEROUTES_TABLE}.zst"
111 | )
112 |
113 |
114 | ##################################################################################################
115 | # RIPE ATLAS VPS BIAS ANALYSIS #
116 | ##################################################################################################
117 | ASNS_TYPES: Path = DEFAULT_DIR / "datasets/asns_types"
118 | ASNS_TYPE_CAIDA: Path = ASNS_TYPES / "caida_enhanced_as_type.json"
119 | ASNS_TYPE_STANFORD: Path = ASNS_TYPES / "AS_categories_stanford.json"
120 |
121 |
122 | ##################################################################################################
123 | # STATIC FILES #
124 | ##################################################################################################
125 | STATIC_PATH: Path = DEFAULT_DIR / "datasets/static_datasets/"
126 |
127 | COUNTRIES_JSON_FILE: Path = STATIC_PATH / "countries.json"
128 | COUNTRIES_TXT_FILE: Path = STATIC_PATH / "countries.txt"
129 | COUNTRIES_CSV_FILE: Path = STATIC_PATH / "iso_code_2.csv"
130 | POPULATION_CITY_FILE: Path = STATIC_PATH / "population.json"
131 | CITIES_500_FILE: Path = STATIC_PATH / "cities500.txt"
132 | POPULATION_DENSITY_FILE: Path = (
133 | STATIC_PATH / "gpw_v4_population_density_rev11_2020_30_sec.tif"
134 | )
135 |
136 | ADDRESS_FILE: Path = (
137 | STATIC_PATH / "internet_address_verfploeter_hitlist_it102w-20230125.fsdb"
138 | )
139 | GEOLITE_FILE: Path = STATIC_PATH / "GeoLite2-City-Blocks-IPv4_20230516.tree"
140 | IP_INFO_GEO_FILE: Path = STATIC_PATH / "ip_info_geo_anchors.json"
141 | MAXMIND_GEO_FILE: Path = STATIC_PATH / "maxmind_free_geo_anchors.json"
142 |
143 | GEOPAPIFY_1_FILE: Path = STATIC_PATH / "geocoded_by_geoapify-10_05_2023_0_500.csv"
144 | GEOPAPIFY_2_FILE: Path = STATIC_PATH / "geocoded_by_geoapify-10_05_2023_500_last.csv"
145 |
146 | IP_TO_ASN_FILE: Path = STATIC_PATH / "2022-03-28.dat"
147 | ANCHORS_SECOND_PAPER_FILE: Path = STATIC_PATH / "anchors_ip_list.json"
148 | CACHED_WEBSITES_FILE: Path = STATIC_PATH / "websites.json"
149 | BGP_PRIFIXES_FILE: Path = STATIC_PATH / "bgp_prefixes.json"
150 |
151 | ##################################################################################################
152 | # ANALYSIS RESULTS FILES #
153 | ##################################################################################################
154 |
155 | # REPRODUCIBILITY
156 | REPRO_ANALYSIS_PATH: Path = DEFAULT_DIR / "analysis/results/reproducibility/"
157 |
158 | REPRO_PROBES_TO_ANCHORS_RESULT_FILE: Path = (
159 | REPRO_ANALYSIS_PATH / "cbg_thresholds_probes_to_anchors.json"
160 | )
161 | REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE: Path = (
162 | REPRO_ANALYSIS_PATH / "vp_selection_algorithm_probes_1.json"
163 | )
164 | REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE: Path = (
165 | REPRO_ANALYSIS_PATH / "vp_selection_algorithm_probes_3.json"
166 | )
167 | REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE: Path = (
168 | REPRO_ANALYSIS_PATH / "vp_selection_algoxrithm_probes_10.json"
169 | )
170 | REPRO_ACCURACY_VS_N_VPS_PROBES_FILE: Path = (
171 | REPRO_ANALYSIS_PATH / "accuracy_vs_n_vps_probes.json"
172 | )
173 | REPRO_ROUND_BASED_ALGORITHM_FILE: Path = (
174 | REPRO_ANALYSIS_PATH / "round_based_algorithm_error_cdf.json"
175 | )
176 |
177 | # FROM USER MEASUREMENTS
178 | USER_ANALYSIS_PATH: Path = DEFAULT_DIR / "analysis/results/user/"
179 |
180 | USER_PROBES_TO_ANCHORS_RESULT_FILE: Path = (
181 | USER_ANALYSIS_PATH / "cbg_thresholds_probes_to_anchors.json"
182 | )
183 | USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE: Path = (
184 | USER_ANALYSIS_PATH / "vp_selection_algorithm_probes_1.json"
185 | )
186 | USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE: Path = (
187 | USER_ANALYSIS_PATH / "vp_selection_algorithm_probes_3.json"
188 | )
189 | USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE: Path = (
190 | USER_ANALYSIS_PATH / "vp_selection_algoxrithm_probes_10.json"
191 | )
192 | USER_ACCURACY_VS_N_VPS_PROBES_FILE: Path = (
193 | USER_ANALYSIS_PATH / "accuracy_vs_n_vps_probes.json"
194 | )
195 | USER_ROUND_BASED_ALGORITHM_FILE: Path = (
196 | USER_ANALYSIS_PATH / "round_based_algorithm_error_cdf.json"
197 | )
198 |
199 | ##################################################################################################
200 | # MEASUREMENTS RESULTS FILES #
201 | ##################################################################################################
202 | MEASUREMENTS_MILLION_SCALE_PATH: Path = (
203 | DEFAULT_DIR / "measurements/results/million_scale/"
204 | )
205 | MEASUREMENTS_STREET_LEVEL_PATH: Path = (
206 | DEFAULT_DIR / "measurements/results/street_level/"
207 | )
208 | MEASUREMENT_CONFIG_PATH: Path = (
209 | DEFAULT_DIR / "measurements/results/million_scale/measurement_config/"
210 | )
211 |
212 | ############## MILLION SCALE FILES
213 | PREFIX_MEASUREMENT_RESULTS: Path = (
214 | MEASUREMENTS_MILLION_SCALE_PATH / "prefix_measurement_results.json"
215 | )
216 | TARGET_MEASUREMENT_RESULTS: Path = (
217 | MEASUREMENTS_MILLION_SCALE_PATH / "target_measurement_results.json"
218 | )
219 |
220 | ############## STREET LEVEL FILES
221 | ANALYZABLE_FILE: Path = MEASUREMENTS_STREET_LEVEL_PATH / "all_res.json"
222 |
223 |
224 | ##################################################################################################
225 | # FIGURES FILES #
226 | ##################################################################################################
227 |
228 | # REPRODUCIBILITY
229 | REPRO_FIGURE_PATH: Path = DEFAULT_DIR / "analysis/figures/reproducibility"
230 |
231 | REPRO_GEO_DATABASE_FILE: Path = REPRO_FIGURE_PATH / "geo_databases.pdf"
232 | REPRO_ACCURACY_VS_NB_VPS_FILE: Path = REPRO_FIGURE_PATH / "accuracy_vs_n_vps_probes.pdf"
233 | REPRO_ACCURACY_VS_SUBSET_SIZES_FILE: Path = (
234 | REPRO_FIGURE_PATH / "accuracy_vs_subset_sizes.pdf"
235 | )
236 | REPRO_CBG_THRESHOLD_PROBES_FILE: Path = REPRO_FIGURE_PATH / "cbg_thresholds_probes.pdf"
237 | REPRO_CBG_THRESHOLD_VP_SELECTION_FILE: Path = (
238 | REPRO_FIGURE_PATH / "cbg_thresholds_vp_selection.pdf"
239 | )
240 | REPRO_CBG_THRESHOLD_CONTINENT_FILE: Path = (
241 | REPRO_FIGURE_PATH / "cbg_thresholds_continent.pdf"
242 | )
243 | REPRO_ROUND_ALGORITHM_ERROR_FILE: Path = REPRO_FIGURE_PATH / "round_algorithm_error.pdf"
244 | REPRO_CLOSE_LANDMARK_FILE: Path = REPRO_FIGURE_PATH / "cdf_close_landmark_check_log.pdf"
245 | REPRO_INVALID_RTT_FILE: Path = REPRO_FIGURE_PATH / "invalid_rtt.pdf"
246 | REPRO_TIME_TO_GEOLOCATE_FILE: Path = REPRO_FIGURE_PATH / "cdf_time_to_geolocate.pdf"
247 | REPRO_SCATTER_DISTANCE_FILE: Path = REPRO_FIGURE_PATH / "scatter_md_vs_d.pdf"
248 | REPRO_SCATTER_DENSITY_FILE: Path = REPRO_FIGURE_PATH / "scatter_density.pdf"
249 | REPRO_CDF_DENSITY_FILE: Path = REPRO_FIGURE_PATH / "cdf_density.pdf"
250 |
251 | # FROM USER MEASUREMENTS
252 | USER_FIGURE_PATH: Path = DEFAULT_DIR / "analysis/figures/user"
253 |
254 | REPRO_GEO_DATABASE_FILE: Path = USER_FIGURE_PATH / "geo_databases.pdf"
255 | USER_ACCURACY_VS_NB_VPS_FILE: Path = USER_FIGURE_PATH / "accuracy_vs_n_vps_probes.pdf"
256 | USER_ACCURACY_VS_SUBSET_SIZES_FILE: Path = (
257 | USER_FIGURE_PATH / "accuracy_vs_subset_sizes.pdf"
258 | )
259 | USER_CBG_THRESHOLD_PROBES_FILE: Path = USER_FIGURE_PATH / "cbg_thresholds_probes.pdf"
260 | USER_CBG_THRESHOLD_VP_SELECTION_FILE: Path = (
261 | USER_FIGURE_PATH / "cbg_thresholds_vp_selection.pdf"
262 | )
263 | USER_CBG_THRESHOLD_CONTINENT_FILE: Path = (
264 | USER_FIGURE_PATH / "cbg_thresholds_continent.pdf"
265 | )
266 | USER_ROUND_ALGORITHM_ERROR_FILE: Path = USER_FIGURE_PATH / "round_algorithm_error.pdf"
267 | USER_CLOSE_LANDMARK_FILE: Path = USER_FIGURE_PATH / "cdf_close_landmark_check_log.pdf"
268 | USER_INVALID_RTT_FILE: Path = USER_FIGURE_PATH / "invalid_rtt.pdf"
269 | USER_TIME_TO_GEOLOCATE_FILE: Path = USER_FIGURE_PATH / "cdf_time_to_geolocate.pdf"
270 | USER_SCATTER_DISTANCE_FILE: Path = USER_FIGURE_PATH / "scatter_md_vs_d.pdf"
271 | USER_SCATTER_DENSITY_FILE: Path = USER_FIGURE_PATH / "scatter_density.pdf"
272 | USER_CDF_DENSITY_FILE: Path = USER_FIGURE_PATH / "cdf_density.pdf"
273 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | # pull the docker image
2 | docker pull clickhouse/clickhouse-server:22.6
3 |
4 |
5 | # start the server using docker
6 | docker run --rm -d \
7 | -v ./clickhouse_files/data:/var/lib/clickhouse/ \
8 | -v ./clickhouse_files/logs:/var/log/clickhouse-server/ \
9 | -v ./clickhouse_files/users.d:/etc/clickhouse-server/users.d:ro \
10 | -v ./clickhouse_files/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \
11 | -p 8123:8123 \
12 | -p 9000:9000 \
13 | --ulimit nofile=262144:262144 \
14 | clickhouse/clickhouse-server:22.6
15 |
16 | # download clickhouse client binary
17 | curl https://clickhouse.com/ | sh
18 | mv clickhouse ./clickhouse_files/
19 |
20 | # install source files
21 | poetry lock
22 | poetry install
23 |
24 | # run clickhouse db installer for table init
25 | poetry run python scripts/utils/clickhouse_installer.py
26 |
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logging.basicConfig(
4 | format="%(asctime)s::%(levelname)s:%(name)s:%(module)s:: %(message)s",
5 | level=logging.INFO,
6 | datefmt="%Y-%m-%d %H:%M:%S",
7 | )
8 |
9 | logger = logging.getLogger()
10 |
--------------------------------------------------------------------------------
/measurements/landmark_traceroutes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Probing part 4\n",
8 | "\n",
9 | "Vantage points will probe the targets in a 3-step method, either by doing pings or traceroutes. \n",
10 | "\n",
11 | "Vantage points are the Ripe Atlas anchors, then indireclty some online landmarks. \n",
12 | "As always, targets are the anchors. \n",
13 | "\n",
14 | "This notebook is an implementation of the street level method. Check the paper for more information. \n",
15 | "To do after create_datasets.ipynb"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import traceback\n",
25 | "\n",
26 | "from pprint import pprint\n",
27 | "from clickhouse_driver import Client\n",
28 | "\n",
29 | "from scripts.utils.file_utils import load_json, dump_json\n",
30 | "from scripts.utils.measurement_utils import load_vps\n",
31 | "from scripts.utils.helpers import haversine\n",
32 | "from scripts.street_level.traceroutes_results import serialize\n",
33 | "from scripts.street_level.three_tiers import get_all_info_geoloc\n",
34 | "from default import USER_ANCHORS_FILE, ANALYZABLE_FILE\n",
35 | "\n",
36 | "NB_VP = 10"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### database for traceroutes"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "### Main\n",
51 | "\n",
52 | "This would take a lot of time (more than a day if you use all the VPs)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# Anchors are the targets and Vantage points\n",
62 | "anchors = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VP)\n",
63 | "try:\n",
64 | " all_res = load_json(ANALYZABLE_FILE)\n",
65 | "except FileNotFoundError:\n",
66 | " all_res = {}\n",
67 | "\n",
68 | "i = 0\n",
69 | "for target in anchors.values():\n",
70 | " try:\n",
71 | " target_ip = target['address_v4']\n",
72 | " if target_ip in all_res: # we skip targets already geolocated\n",
73 | " continue\n",
74 | " print(f\"{i}:{target_ip}\")\n",
75 | " i += 1\n",
76 | "\n",
77 | " res = get_all_info_geoloc(target_ip, vps=anchors.values())\n",
78 | " res = serialize(res)\n",
79 | " # We save the coordinates of the targets as given by RIPE Atlas\n",
80 | " res['RIPE:lat'] = target['geometry']['coordinates'][1]\n",
81 | " res['RIPE:lon'] = target['geometry']['coordinates'][0]\n",
82 | "\n",
83 | " # We save the error of the estimated geolocation at each step\n",
84 | " if res['lat'] != None and res['lon'] != None:\n",
85 | " res['error'] = haversine(\n",
86 | " (res['lat'], res['lon']), (res['RIPE:lat'], res['RIPE:lon']))\n",
87 | " if 'tier1:lat' in res and 'tier1:lon' in res and res['tier1:lat'] != None and res['tier1:lon'] != None:\n",
88 | " res['tier1:error'] = haversine(\n",
89 | " (res['tier1:lat'], res['tier1:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n",
90 | " if 'tier2:lat' in res and 'tier2:lon' in res and res['tier2:lat'] != None and res['tier2:lon'] != None:\n",
91 | " res['tier2:error'] = haversine(\n",
92 | " (res['tier2:lat'], res['tier2:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n",
93 | " if 'tier3:lat' in res and 'tier3:lon' in res and res['tier3:lat'] != None and res['tier3:lon'] != None:\n",
94 | " res['tier3:error'] = haversine(\n",
95 | " (res['tier3:lat'], res['tier3:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n",
96 | "\n",
97 | " all_res[target_ip] = res\n",
98 | " # We save the results\n",
99 | " dump_json(all_res, ANALYZABLE_FILE)\n",
100 | " except Exception:\n",
101 | " traceback.print_exc()"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "### Geolocat one IP"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "{'target_ip': '195.83.132.129', 'tier1:done': False, 'tier2:done': False, 'tier3:done': False, 'negative_rtt_included': True, 'speed_threshold': 0.6666666666666666, 'tier1:lat': None, 'tier1:lon': None, 'vps': set(), 'tier1:duration': 1282.0457310676575, 'lat': None, 'lon': None}\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "target_ip = '195.83.132.129' # LAAS/CNRS\n",
126 | "geolocation = get_all_info_geoloc(target_ip)\n",
127 | "#geolocation = geoloc(target_ip)\n",
128 | "print(geolocation)\n",
129 | "geolocation = serialize(geolocation)\n",
130 | "dump_json(geolocation, 'res_tmp.json')"
131 | ]
132 | }
133 | ],
134 | "metadata": {
135 | "kernelspec": {
136 | "display_name": "review-8XQ99qZ1-py3.10",
137 | "language": "python",
138 | "name": "python3"
139 | },
140 | "language_info": {
141 | "codemirror_mode": {
142 | "name": "ipython",
143 | "version": 3
144 | },
145 | "file_extension": ".py",
146 | "mimetype": "text/x-python",
147 | "name": "python",
148 | "nbconvert_exporter": "python",
149 | "pygments_lexer": "ipython3",
150 | "version": "3.9.13"
151 | },
152 | "orig_nbformat": 4
153 | },
154 | "nbformat": 4,
155 | "nbformat_minor": 2
156 | }
157 |
--------------------------------------------------------------------------------
/measurements/million_scale_measurements.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Probing part 1\n",
8 | "\n",
9 | "Vantage points will probe either the targets themselves (step 2) or other addresses in the same /24 prefix (step 2).\n",
10 | "\n",
11 | "Vantage points are only the anchors. \n",
12 | "As always, targets are the anchors. \n",
13 | "\n",
14 | "This notebook is an implementation of the million scale method. Check the paper for more information. \n",
15 | "To do after create_datasets.ipynb"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import uuid\n",
25 | "\n",
26 | "from logger import logger\n",
27 | "from scripts.utils.file_utils import load_json\n",
28 | "from scripts.utils.measurement_utils import (\n",
29 | " load_targets,\n",
30 | " load_vps,\n",
31 | " get_measurement_config,\n",
32 | " save_measurement_config,\n",
33 | " get_target_prefixes,\n",
34 | " ping_prefixes,\n",
35 | " ping_targets,\n",
36 | ")\n",
37 | "from default import (\n",
38 | " USER_ANCHORS_FILE,\n",
39 | " USER_HITLIST_FILE,\n",
40 | " MEASUREMENT_CONFIG_PATH,\n",
41 | ")\n",
42 | "\n",
43 | "# will define the number of vps and targets to use\n",
44 | "NB_TARGETS = 2\n",
45 | "NB_VPS = 4"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "## Load targets and vps dataset"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "targets = load_targets(USER_ANCHORS_FILE, nb_target=NB_TARGETS)\n",
62 | "vps = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VPS)\n",
63 | "\n",
64 | "# every anchors /24 subnet\n",
65 | "target_addrs = [t[\"address_v4\"] for t in targets]\n",
66 | "target_prefixes = get_target_prefixes(target_addrs)\n",
67 | "\n",
68 | "# responsive IP addresses in all /24 prefixes\n",
69 | "targets_per_prefix = load_json(USER_HITLIST_FILE)\n",
70 | "\n",
71 | "logger.info(f\"nb targets: {len(targets)}\")\n",
72 | "logger.info(f\"nb_vps : {len(vps)}\")"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "## Generate measurement config\n",
80 | "\n",
81 | "This configuration is used to retrieve all measurements results from RIPE Atlas using their API."
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 3,
87 | "metadata": {},
88 | "outputs": [
89 | {
90 | "name": "stderr",
91 | "output_type": "stream",
92 | "text": [
93 | "2024-10-04 13:13:06::INFO:root:3171606573:: Starting experiment with uuid : c78efe35-8089-41a9-9206-ac7bac4a8a68\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "# measurement configuration for retrieval\n",
99 | "experiment_uuid = str(uuid.uuid4())\n",
100 | "target_measurement_uuid = str(uuid.uuid4())\n",
101 | "prefix_measurement_uuid = str(uuid.uuid4())\n",
102 | "\n",
103 | "config_file_path = MEASUREMENT_CONFIG_PATH / f\"{experiment_uuid}.json\"\n",
104 | "\n",
105 | "logger.info(f\"Starting experiment with uuid : {experiment_uuid}\")\n",
106 | "\n",
107 | "measurement_config = get_measurement_config(\n",
108 | " experiment_uuid=experiment_uuid,\n",
109 | " target_measurement_uuid=target_measurement_uuid,\n",
110 | " prefix_measurement_uuid=prefix_measurement_uuid,\n",
111 | " targets=targets,\n",
112 | " target_prefixes=target_prefixes,\n",
113 | " vps=vps,\n",
114 | ")\n",
115 | "\n",
116 | "save_measurement_config(measurement_config, config_file_path)"
117 | ]
118 | },
119 | {
120 | "attachments": {},
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "# Step 1: probing each target prefixes"
125 | ]
126 | },
127 | {
128 | "attachments": {},
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "## Probe target prefixes\n",
133 | "WARNING : Time consumming section"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 4,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "name": "stderr",
143 | "output_type": "stream",
144 | "text": [
145 | "2024-10-04 13:13:06::INFO:root:measurement_utils:: No cached results available\n",
146 | "2024-10-04 13:13:06::INFO:root:measurement_utils:: Starting measurements dd2e9428-762d-4353-99ca-613057d430a3 with parameters: dry_run=False; nb_targets=2; nb_vps=4.\n",
147 | "2024-10-04 13:13:06::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942232\n",
148 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942233\n",
149 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942234\n",
150 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942235\n",
151 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942236\n",
152 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942237\n",
153 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement : dd2e9428-762d-4353-99ca-613057d430a3 done\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "ping_prefixes(\n",
159 | " measurement_uuid=prefix_measurement_uuid,\n",
160 | " measurement_config=measurement_config,\n",
161 | " target_prefixes=target_prefixes,\n",
162 | " targets_per_prefix=targets_per_prefix,\n",
163 | " vps=vps,\n",
164 | ")\n",
165 | "\n",
166 | "save_measurement_config(measurement_config, config_file_path)"
167 | ]
168 | },
169 | {
170 | "attachments": {},
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "# Step 2: probing each target"
175 | ]
176 | },
177 | {
178 | "attachments": {},
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "## Probe targets\n",
183 | "WARNING : Time consumming section"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 6,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "name": "stderr",
193 | "output_type": "stream",
194 | "text": [
195 | "2024-10-04 13:13:11::INFO:root:measurement_utils:: Starting measurements 6796bfe3-7137-43f1-9f9f-71e0a141157d with parameters: dry_run=False; nb_targets=6; nb_vps=4.\n",
196 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942244\n",
197 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942245\n",
198 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942246\n",
199 | "2024-10-04 13:13:13::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942247\n",
200 | "2024-10-04 13:13:13::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942248\n",
201 | "2024-10-04 13:13:14::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942249\n",
202 | "2024-10-04 13:13:14::INFO:root:ping_and_traceroute_classes:: measurement : 6796bfe3-7137-43f1-9f9f-71e0a141157d done\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "# measurement configuration for retrieval\n",
208 | "ping_targets(\n",
209 | " measurement_uuid=target_measurement_uuid,\n",
210 | " measurement_config=measurement_config,\n",
211 | " targets=targets,\n",
212 | " vps=vps,\n",
213 | " use_cache=False,\n",
214 | ")\n",
215 | "save_measurement_config(measurement_config, config_file_path)"
216 | ]
217 | },
218 | {
219 | "attachments": {},
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "## Retrieve prefix results\n",
224 | "WARNING : Time consuming section\n",
225 | "\n",
226 | "Note: it might take some time before measurement results are available through RIPE API. If no results are available, retry after a few minutes (or hours, it might really depends on the probe itself)."
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 7,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "from logger import logger\n",
236 | "from scripts.utils.file_utils import load_json\n",
237 | "from scripts.utils.measurement_utils import (\n",
238 | " retrieve_results,\n",
239 | " insert_prefix_results,\n",
240 | " insert_target_results,\n",
241 | ")\n",
242 | "from default import (\n",
243 | " PREFIX_MEASUREMENT_RESULTS,\n",
244 | " TARGET_MEASUREMENT_RESULTS,\n",
245 | ")\n",
246 | "\n",
247 | "# will define the number of vps and targets to use\n",
248 | "NB_TARGETS = 2\n",
249 | "NB_VPS = 4"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 8,
255 | "metadata": {},
256 | "outputs": [
257 | {
258 | "name": "stderr",
259 | "output_type": "stream",
260 | "text": [
261 | "2024-10-04 13:13:14::INFO:root:3539837011:: {'experiment_uuid': 'c78efe35-8089-41a9-9206-ac7bac4a8a68', 'status': 'ongoing', 'start_time': '2024-10-04 13:13:06.112516', 'end_time': None, 'is_dry_run': False, 'nb_targets': 2, 'nb_vps': 4, 'description': 'measurements from a set of vps towards all targets/target prefixes', 'af': 4, 'target_measurements': {'measurement_uuid': '6796bfe3-7137-43f1-9f9f-71e0a141157d', 'targets': ['103.196.37.98', '195.246.236.1', '77.220.233.1', '185.230.79.16', '185.34.2.114', '217.25.179.62'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047594.2916105, 'start_time': 1728047591.8001034}, 'prefix_measurements': {'measurement_uuid': 'dd2e9428-762d-4353-99ca-613057d430a3', 'targets': ['103.196.37.0', '195.246.236.0'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047589.574289, 'start_time': 1728047586.1349247}, 'meshed_measurements': {'measurement_uuid': '805d6778-9e09-4be7-9c43-d4aafc813a10', 'targets': ['103.196.37.98', '195.246.236.1', '77.220.233.1', '185.230.79.16', '185.34.2.114', '217.25.179.62'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047591.7847333, 'start_time': 1728047589.5959833}}\n"
262 | ]
263 | }
264 | ],
265 | "source": [
266 | "measurement_config = load_json(config_file_path)\n",
267 | "logger.info(measurement_config)"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 9,
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "name": "stderr",
277 | "output_type": "stream",
278 | "text": [
279 | "2024-10-04 13:13:14::INFO:root:1680719454:: retrieving results for measurement ids: dd2e9428-762d-4353-99ca-613057d430a3\n",
280 | "2024-10-04 13:13:15::INFO:root:measurement_utils:: nb measurements retrieved: 0 for measurement_uuid : dd2e9428-762d-4353-99ca-613057d430a3\n"
281 | ]
282 | },
283 | {
284 | "ename": "UnboundLocalError",
285 | "evalue": "local variable 'result' referenced before assignment",
286 | "output_type": "error",
287 | "traceback": [
288 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
289 | "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
290 | "Cell \u001b[0;32mIn[9], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# sometimes, not all probes give output, reduce timeout if you do not want to wait for too long\u001b[39;00m\n\u001b[1;32m 6\u001b[0m response \u001b[38;5;241m=\u001b[39m retrieve_results(prefix_measurement_uuid, PREFIX_MEASUREMENT_RESULTS)\n\u001b[0;32m----> 8\u001b[0m \u001b[43minsert_prefix_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n",
291 | "File \u001b[0;32m/storage/hugo/geoloc-imc-2023/scripts/utils/measurement_utils.py:324\u001b[0m, in \u001b[0;36minsert_prefix_results\u001b[0;34m(results)\u001b[0m\n\u001b[1;32m 319\u001b[0m values_description \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msrc, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m )\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m results:\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mno data to insert, data = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mresult\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[1;32m 327\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 328\u001b[0m \u001b[38;5;66;03m# parse response\u001b[39;00m\n",
292 | "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'result' referenced before assignment"
293 | ]
294 | }
295 | ],
296 | "source": [
297 | "prefix_measurement_uuid = measurement_config[\"prefix_measurements\"][\"measurement_uuid\"]\n",
298 | "\n",
299 | "logger.info(f\"retrieving results for measurement ids: {prefix_measurement_uuid}\")\n",
300 | "\n",
301 | "# sometimes, not all probes give output, reduce timeout if you do not want to wait for too long\n",
302 | "response = retrieve_results(prefix_measurement_uuid, PREFIX_MEASUREMENT_RESULTS)\n",
303 | "\n",
304 | "insert_prefix_results(response)"
305 | ]
306 | },
307 | {
308 | "attachments": {},
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "## Retrieve traget results\n",
313 | "WARNING : Time consumming section\n",
314 | "\n",
315 | "Note: it might take some time before measurement results are available through RIPE API. If no results are available, retry after a few minutes (or hours, it might really depends on the probe itself)."
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 13,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "name": "stderr",
325 | "output_type": "stream",
326 | "text": [
327 | "2024-10-03 18:08:53::INFO:root:3802694766:: retrieving results for measurement ids: 18020ef4-fcc5-410b-9eb1-9ab3a18dd3a3\n",
328 | "2024-10-03 18:08:53::INFO:root:measurement_utils:: nb measurements retrieved: 20 for measurement_uuid : 18020ef4-fcc5-410b-9eb1-9ab3a18dd3a3\n",
329 | "2024-10-03 18:08:53::INFO:root:measurement_utils:: Target measurements successfully inserted in table : user_vps_to_target\n"
330 | ]
331 | }
332 | ],
333 | "source": [
334 | "target_measurement_uuid = measurement_config[\"target_measurements\"][\"measurement_uuid\"]\n",
335 | "\n",
336 | "logger.info(f\"retrieving results for measurement ids: {target_measurement_uuid}\")\n",
337 | "\n",
338 | "response = retrieve_results(target_measurement_uuid, TARGET_MEASUREMENT_RESULTS)\n",
339 | "\n",
340 | "insert_target_results(response)"
341 | ]
342 | }
343 | ],
344 | "metadata": {
345 | "kernelspec": {
346 | "display_name": "geoloc-imc-2023-GZT64Hva-py3.10",
347 | "language": "python",
348 | "name": "python3"
349 | },
350 | "language_info": {
351 | "codemirror_mode": {
352 | "name": "ipython",
353 | "version": 3
354 | },
355 | "file_extension": ".py",
356 | "mimetype": "text/x-python",
357 | "name": "python",
358 | "nbconvert_exporter": "python",
359 | "pygments_lexer": "ipython3",
360 | "version": "3.10.12"
361 | },
362 | "orig_nbformat": 4
363 | },
364 | "nbformat": 4,
365 | "nbformat_minor": 2
366 | }
367 |
--------------------------------------------------------------------------------
/measurements/million_scale_measurements.py:
--------------------------------------------------------------------------------
1 | """perform a meshed ping measurement where each VP is probed by every others"""
2 |
3 | from logger import logger
4 |
5 | from scripts.utils.file_utils import load_json
6 | from scripts.utils.measurement_utils import (
7 | load_targets,
8 | load_vps,
9 | get_measurement_config,
10 | save_measurement_config,
11 | get_target_prefixes,
12 | ping_prefixes,
13 | ping_targets,
14 | retrieve_results,
15 | insert_prefix_results,
16 | insert_target_results,
17 | )
18 | from default import (
19 | USER_ANCHORS_FILE,
20 | USER_HITLIST_FILE,
21 | PREFIX_MEASUREMENT_RESULTS,
22 | TARGET_MEASUREMENT_RESULTS,
23 | MEASUREMENT_CONFIG_PATH,
24 | )
25 |
26 | # Small number of targets and VPs for testing
27 | # Change to real Anchors and VPs values for complete measurement
28 | NB_TARGETS = 5
29 | NB_VPS = 10
30 |
31 | # measurement configuration for retrieval,
32 | # replace if you want to create new batch of measurements
33 | EXPERIMENT_UUID = "3992e46c-73cf-4a7b-9428-3198856039a9"
34 | TARGET_MESAUREMENT_UUID = "03eb9559-88fe-41cb-b62c-4c07d1d5acb8"
35 | PREFIX_MESAUREMENT_UUID = "a09709aa-be76-4687-852e-64e8090bee70"
36 | CONFIG_PATH = MEASUREMENT_CONFIG_PATH / f"{EXPERIMENT_UUID}.json"
37 |
38 |
39 | def main_measurements() -> None:
40 | """perform all measurements related with million scale"""
41 | # set any of these var to execute the corresponding fct
42 | do_target_pings = True
43 | do_target_prefix_pings = True
44 |
45 | # load targets and VPs
46 | targets = load_targets(USER_ANCHORS_FILE, nb_target=NB_TARGETS)
47 | vps = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VPS)
48 |
49 | # every anchors /24 subnet
50 | target_addrs = [t["address_v4"] for t in targets]
51 | target_prefixes = get_target_prefixes(target_addrs)
52 | # responsive IP addresses in all /24 prefixes
53 | targets_per_prefix = load_json(USER_HITLIST_FILE)
54 |
55 | logger.info(f"Starting experiment with uuid :: {EXPERIMENT_UUID}")
56 | logger.info(f"Config output :: {CONFIG_PATH}")
57 |
58 | # check if measurements measurement under this config uuid already exists
59 | if CONFIG_PATH.exists():
60 | logger.info(f"Loading existing measurement config:: {EXPERIMENT_UUID}")
61 | measurement_config = load_json(CONFIG_PATH)
62 | else:
63 | # create a new config is no existing one
64 | measurement_config = get_measurement_config(
65 | targets=targets,
66 | vps=vps,
67 | target_prefixes=target_prefixes,
68 | experiment_uuid=EXPERIMENT_UUID,
69 | target_measurement_uuid=TARGET_MESAUREMENT_UUID,
70 | prefix_measurement_uuid=PREFIX_MESAUREMENT_UUID,
71 | )
72 | save_measurement_config(measurement_config, CONFIG_PATH)
73 |
74 | if do_target_pings:
75 | vps.extend(targets)
76 |
77 | logger.info(f"Starting targets pigns :: {TARGET_MESAUREMENT_UUID}")
78 | logger.info(f"Nb targets :: {len(targets)}")
79 | logger.info(f"Nb vps :: {len(vps)}")
80 |
81 | # measurement configuration for retrieval
82 | ping_targets(
83 | measurement_uuid=TARGET_MESAUREMENT_UUID,
84 | measurement_config=measurement_config,
85 | targets=targets,
86 | vps=vps,
87 | use_cache=True,
88 | )
89 |
90 | # update config
91 | save_measurement_config(measurement_config, CONFIG_PATH)
92 |
93 | if do_target_prefix_pings:
94 | logger.info(f"Starting prefix pigns :: {PREFIX_MESAUREMENT_UUID}")
95 | logger.info(f"Nb targets :: {len(targets)}")
96 | logger.info(f"Nb prefixes :: {len(target_prefixes)}")
97 | logger.info(f"Nb vps :: {len(vps)}")
98 |
99 | ping_prefixes(
100 | vps=vps,
101 | target_prefixes=target_prefixes,
102 | targets_per_prefix=targets_per_prefix,
103 | measurement_uuid=PREFIX_MESAUREMENT_UUID,
104 | measurement_config=measurement_config,
105 | )
106 |
107 |
108 | def main_retrieve_results() -> None:
109 | """retrieve all measurement results related with million scale"""
110 | retrieve_target_measurements = True
111 | retrieve_prefix_measurements = True
112 |
113 | measurement_config = load_json(CONFIG_PATH)
114 | logger.info(f"{measurement_config}")
115 |
116 | if retrieve_target_measurements:
117 | target_measurement_uuid = measurement_config["target_measurements"][
118 | "measurement_uuid"
119 | ]
120 |
121 | logger.info(
122 | f"retrieving results for measurement ids: {target_measurement_uuid}"
123 | )
124 |
125 | # sometimes, not all probes give output, reduce timeout if you do not want to wait for too long
126 | response = retrieve_results(TARGET_MESAUREMENT_UUID, TARGET_MEASUREMENT_RESULTS)
127 |
128 | # will output into user tables
129 | insert_target_results(response)
130 |
131 | if retrieve_prefix_measurements:
132 | prefix_measurement_uuid = measurement_config["prefix_measurements"][
133 | "measurement_uuid"
134 | ]
135 |
136 | logger.info(
137 | f"retrieving results for measurement ids: {prefix_measurement_uuid}"
138 | )
139 |
140 | # sometimes, not all probes give output, reduce timeout if you do not want to wait for too long
141 | response = retrieve_results(TARGET_MESAUREMENT_UUID, PREFIX_MEASUREMENT_RESULTS)
142 |
143 | # will output into user tables
144 | insert_prefix_results(response)
145 |
146 |
147 | if __name__ == "__main__":
148 | do_measurements = True
149 | do_retrieve_results = True
150 |
151 | if do_measurements:
152 | main_measurements()
153 |
154 | if do_retrieve_results:
155 | main_retrieve_results()
156 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "geoscale"
3 | version = "1.0"
4 | description = "Geolocation reproduction paper"
5 | authors = ["Danaelmilo "]
6 | readme = "README.md"
7 | packages = [
8 | { include = "scripts" },
9 | ]
10 |
11 | [tool.poetry.dependencies]
12 | python = "^3.9"
13 | numpy = "^1.21.3"
14 | matplotlib = "^3.4.3"
15 | requests = "^2.17.0"
16 | clickhouse-driver = "^0.2.6"
17 | overpy = "^0.6"
18 | dnspython = "^2.4.1"
19 | geopy = "^2.3.0"
20 | ujson = "^5.8.0"
21 | scipy = "^1.5.0"
22 | geopandas = "^0.13.2"
23 | rasterio = "^1.3.8"
24 | ipykernel = "^6.24.0"
25 | jupyter = "^1.0.0"
26 | py-radix = "^0.10.0"
27 | pyasn = "^1.6.2"
28 | clickhouse_driver = "^0.2.2"
29 | python-dotenv="^0.20.0"
30 |
31 | [tool.poetry.dev-dependencies]
32 | ipykernel = "^6.25.1"
33 |
34 | [build-system]
35 | requires = ["poetry-core"]
36 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/scripts/ripe_atlas/atlas_api.py:
--------------------------------------------------------------------------------
1 | # All functions to query RIPE Atlas API
2 |
3 | import json
4 | import time
5 | import requests
6 | import ipaddress
7 |
8 | from collections import defaultdict, OrderedDict
9 | from ipaddress import IPv4Network
10 | from random import randint
11 |
12 | from logger import logger
13 |
14 |
15 | class RIPEAtlas(object):
16 | def __init__(
17 | self,
18 | account: str,
19 | key: str,
20 | ) -> None:
21 | self.account = account
22 | self.key = key
23 |
24 | def ping(
25 | self, target, vps, tag: str, nb_packets: int = 3, max_retry: int = 60
26 | ) -> None:
27 | """start ping measurement towards target from vps, return Atlas measurement id"""
28 |
29 | for _ in range(max_retry):
30 | response = requests.post(
31 | f"https://atlas.ripe.net/api/v2/measurements/?key={self.key}",
32 | json={
33 | "definitions": [
34 | {
35 | "target": target,
36 | "af": 4,
37 | "packets": nb_packets,
38 | "size": 48,
39 | "tags": [tag],
40 | "description": f"Dioptra Geolocation of {target}",
41 | "resolve_on_probe": False,
42 | "skip_dns_check": True,
43 | "include_probe_id": False,
44 | "type": "ping",
45 | }
46 | ],
47 | "probes": [
48 | {"value": vp, "type": "probes", "requested": 1} for vp in vps
49 | ],
50 | "is_oneoff": True,
51 | "bill_to": self.account,
52 | },
53 | ).json()
54 |
55 | try:
56 | measurement_id = response["measurements"][0]
57 | break
58 | except KeyError:
59 | logger.info(response)
60 | logger.warning("Too much measurements.", "Waiting.")
61 | time.sleep(60)
62 | else:
63 | raise Exception("Too much measurements. Stopping.")
64 |
65 | if not response:
66 | return
67 |
68 | try:
69 | return measurement_id
70 | except (IndexError, KeyError):
71 | return
72 |
73 | def traceroute_measurement(self, target, probes_selector, options):
74 | ripe_key, description, tags, is_public, packets, protocol = options
75 |
76 | core_parameters = {
77 | "target": target,
78 | "af": 4,
79 | "description": description,
80 | "resolve_on_probe": False,
81 | "type": "traceroute",
82 | "tags": tags,
83 | "is_public": is_public,
84 | }
85 |
86 | traceroute_parameters = {
87 | "packets": packets,
88 | "protocol": protocol,
89 | }
90 |
91 | parameters = {}
92 | parameters.update(core_parameters)
93 | parameters.update(traceroute_parameters)
94 |
95 | definitions = [parameters]
96 |
97 | response = requests.post(
98 | f"https://atlas.ripe.net/api/v2/measurements/?key={ripe_key}",
99 | json={
100 | "definitions": definitions,
101 | "probes": [probes_selector],
102 | "is_oneoff": True,
103 | "bill_to": self.account,
104 | },
105 | ).json()
106 | return response
107 |
108 | def __str__(self):
109 | return "RIPE Atlas"
110 |
111 |
112 | def ripe_traceroute_to_csv(traceroute):
113 | protocols = {"ICMP": 1, "TCP": 6, "UDP": 17}
114 | rows = []
115 | try:
116 | src_addr = traceroute["from"]
117 | dst_addr = traceroute["dst_addr"]
118 | af = traceroute["af"]
119 | if af == 4:
120 | dst_prefix = ".".join(dst_addr.split(".")[:3] + ["0"])
121 | elif af == 6:
122 | dst_prefix = str(
123 | ipaddress.ip_network(dst_addr + "/48", strict=False).network_address
124 | )
125 | except (KeyError, ValueError):
126 | return rows
127 |
128 | for hop in traceroute["result"]:
129 | for response in hop.get("result", []):
130 | if not response or response.get("error"):
131 | continue
132 | if response.get("x") == "*" or not response.get("rtt"):
133 | response["from"] = "*"
134 | response["rtt"] = 0
135 | response["ttl"] = 0
136 | proto = protocols[traceroute["proto"]]
137 | try:
138 | row = (
139 | src_addr,
140 | dst_prefix,
141 | dst_addr,
142 | response["from"],
143 | proto,
144 | hop["hop"],
145 | response["rtt"],
146 | response["ttl"],
147 | traceroute["prb_id"],
148 | traceroute["msm_id"],
149 | traceroute["timestamp"],
150 | )
151 | row_str = "".join(f",{x}" for x in row)[1:]
152 | rows.append(row_str)
153 | except Exception:
154 | print("ERROR", response)
155 |
156 | return rows
157 |
158 |
159 | def fetch_traceroutes_from_measurement_ids_no_csv(
160 | measurement_ids, start=None, stop=None
161 | ):
162 | res = []
163 | for measurement_id in measurement_ids:
164 | result_url = (
165 | f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/?"
166 | )
167 | if start:
168 | result_url += f"start={start}"
169 | if stop:
170 | result_url += f"&stop={stop}"
171 | traceroutes = requests.get(result_url).json()
172 | if "error" in traceroutes:
173 | print(traceroutes)
174 | continue
175 | for traceroute in traceroutes:
176 | rows = ripe_traceroute_to_csv(traceroute)
177 | for row in rows:
178 | res.append(row)
179 | return res
180 |
181 |
182 | def wait_for(measurement_id: str, max_retry: int = 30) -> None:
183 | for _ in range(max_retry):
184 | response = requests.get(
185 | f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/"
186 | ).json()
187 |
188 | # check if measurement is ongoing or not
189 | if response["status"]["name"] != "Ongoing":
190 | return response
191 |
192 | time.sleep(10)
193 |
194 | return None
195 |
196 |
197 | def get_prefix_from_ip(addr):
198 | """from an ip addr return /24 prefix"""
199 | prefix = addr.split(".")[:-1]
200 | prefix.append("0")
201 | prefix = ".".join(prefix)
202 | return prefix
203 |
204 |
205 | def get_target_hitlist(target_prefix, nb_targets, targets_per_prefix):
206 | """from ip, return a list of target ips"""
207 | target_addr_list = []
208 | try:
209 | target_addr_list = targets_per_prefix[target_prefix]
210 | except KeyError:
211 | pass
212 |
213 | target_addr_list = list(set(target_addr_list))
214 |
215 | if len(target_addr_list) < nb_targets:
216 | prefix = IPv4Network(target_prefix + "/24")
217 | target_addr_list.extend(
218 | [
219 | str(prefix[randint(1, 254)])
220 | for _ in range(0, nb_targets - len(target_addr_list))
221 | ]
222 | )
223 |
224 | if len(target_addr_list) > nb_targets:
225 | target_addr_list = target_addr_list[:nb_targets]
226 |
227 | return target_addr_list
228 |
229 |
230 | def is_geoloc_disputed(probe: dict) -> bool:
231 | """check if geoloc disputed flag is contained in probe metadata"""
232 |
233 | tags = probe["tags"]
234 | for tag in tags:
235 | if tag["slug"] == "system-geoloc-disputed":
236 | return True
237 | return False
238 |
239 |
240 | def get_measurement_url(measurement_id: int) -> str:
241 | """return Atlas API url for get measurement request"""
242 |
243 | return f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/"
244 |
245 |
246 | def get_response(url: str, max_retry: int = 60, wait_time: int = 2) -> list:
247 | """request to Atlas API"""
248 |
249 | for _ in range(max_retry):
250 | response = requests.get(url)
251 |
252 | # small parsing, as response might not be Json formatted
253 | try:
254 | response = json.loads(response.content)
255 | except json.JSONDecodeError:
256 | response = response.content.decode()
257 | response = response.replace("}{", "}, {")
258 | response = response.replace("} {", "}, {")
259 | response = json.loads(response)
260 |
261 | if response != []:
262 | break
263 | time.sleep(wait_time)
264 |
265 | return response
266 |
267 |
268 | def parse_measurements_results(response: list) -> dict:
269 | """from get Atlas measurement request return parsed results"""
270 |
271 | # parse response
272 | measurement_results = defaultdict(dict)
273 | for result in response:
274 | # parse results and calculate geoloc
275 | if result.get("result") is not None:
276 | dst_addr = result["dst_addr"]
277 | vp_addr = result["from"]
278 |
279 | if type(result["result"]) == list:
280 | rtt_list = [list(rtt.values())[0] for rtt in result["result"]]
281 | else:
282 | rtt_list = [result["result"]["rtt"]]
283 |
284 | # remove stars from results
285 | rtt_list = list(filter(lambda x: x != "*", rtt_list))
286 | if not rtt_list:
287 | continue
288 |
289 | # sometimes connection error with vantage point cause result to be string message
290 | try:
291 | min_rtt = min(rtt_list)
292 | except TypeError:
293 | continue
294 |
295 | if isinstance(min_rtt, str):
296 | continue
297 |
298 | measurement_results[dst_addr][vp_addr] = {
299 | "node": vp_addr,
300 | "min_rtt": min_rtt,
301 | "rtt_list": rtt_list,
302 | }
303 |
304 | else:
305 | logger.warning(f"no results: {result}")
306 |
307 | # order vps per increasing rtt
308 | for dst_addr in measurement_results:
309 | measurement_results[dst_addr] = OrderedDict(
310 | {
311 | vp: results
312 | for vp, results in sorted(
313 | measurement_results[dst_addr].items(),
314 | key=lambda item: item[1]["min_rtt"],
315 | )
316 | }
317 | )
318 |
319 | return measurement_results
320 |
321 |
322 | def get_measurement_from_id(
323 | measurement_id: int,
324 | max_retry: int = 60,
325 | wait_time: int = 10,
326 | ) -> dict:
327 | """retrieve measurement results from RIPE Atlas with measurement id"""
328 |
329 | url = get_measurement_url(measurement_id)
330 |
331 | response = get_response(url, max_retry=max_retry, wait_time=wait_time)
332 |
333 | return response
334 |
335 |
336 | def get_measurements_from_tag(tag: str) -> dict:
337 | """retrieve all measurements that share the same tag and return parsed measurement results"""
338 |
339 | url = f"https://atlas.ripe.net/api/v2/measurements/tags/{tag}/results/"
340 |
341 | response = get_response(url, max_retry=1, wait_time=1)
342 |
343 | return response
344 |
345 |
346 | def get_from_atlas(url: str):
347 | """get request url atlas endpoint"""
348 | response = requests.get(url).json()
349 | while True:
350 | for anchor in response["results"]:
351 | yield anchor
352 |
353 | if response["next"]:
354 | response = requests.get(response["next"]).json()
355 | else:
356 | break
357 |
358 |
359 | def get_atlas_probes() -> list:
360 | """return all connected atlas probes"""
361 | probes = []
362 | rejected = 0
363 | geoloc_disputed = 0
364 | for _, probe in enumerate(get_from_atlas("https://atlas.ripe.net/api/v2/probes/")):
365 | # filter probes based on generic criteria
366 | if not probe["is_anchor"]:
367 | if (
368 | probe["status"]["name"] != "Connected"
369 | or probe.get("geometry") is None
370 | or probe.get("address_v4") is None
371 | or probe.get("country_code") is None
372 | ):
373 | rejected += 1
374 | continue
375 |
376 | if is_geoloc_disputed(probe):
377 | geoloc_disputed += 1
378 | continue
379 |
380 | reduced_probe = {
381 | "id": probe["id"],
382 | "address_v4": probe["address_v4"],
383 | "asn_v4": probe["asn_v4"],
384 | "country_code": probe["country_code"],
385 | "geometry": probe["geometry"],
386 | }
387 | probes.append(reduced_probe)
388 |
389 | return probes, rejected, geoloc_disputed
390 |
391 |
392 | def get_atlas_anchors() -> list:
393 | """return all atlas anchors"""
394 | anchors = []
395 | rejected = 0
396 | geoloc_disputed = 0
397 | for _, anchor in enumerate(get_from_atlas("https://atlas.ripe.net/api/v2/probes/")):
398 | # filter anchors based on generic criteria
399 | if anchor["is_anchor"]:
400 | if (
401 | anchor["status"]["name"] != "Connected"
402 | or anchor.get("geometry") is None
403 | or anchor.get("address_v4") is None
404 | or anchor.get("country_code") is None
405 | ):
406 | rejected += 1
407 | continue
408 |
409 | if is_geoloc_disputed(anchor):
410 | geoloc_disputed += 1
411 | continue
412 |
413 | reduced_anchor = {
414 | "id": anchor["id"],
415 | "address_v4": anchor["address_v4"],
416 | "asn_v4": anchor["asn_v4"],
417 | "country_code": anchor["country_code"],
418 | "geometry": anchor["geometry"],
419 | "id": anchor["id"],
420 | }
421 | anchors.append(reduced_anchor)
422 |
423 | return anchors, rejected, geoloc_disputed
424 |
--------------------------------------------------------------------------------
/scripts/ripe_atlas/ping_and_traceroute_classes.py:
--------------------------------------------------------------------------------
1 | # Two classes to instantiate before calling RIPE Atlas API: one for ping measurements and one for traceroute measurements
2 |
3 | import time
4 |
5 | from pprint import pprint
6 | from copy import copy
7 |
8 | from logger import logger
9 | from scripts.ripe_atlas.atlas_api import RIPEAtlas, wait_for, get_target_hitlist
10 | from scripts.utils.credentials import get_ripe_atlas_credentials
11 |
12 |
13 | MAX_NUMBER_OF_VPS = 1_000
14 | NB_MAX_CONCURRENT_MEASUREMENTS = 90
15 | NB_PACKETS = 3
16 | NB_TARGETS_PER_PREFIX = 3
17 |
18 |
19 | class PING:
20 | def __init__(
21 | self,
22 | ) -> None:
23 | ripe_credentials = get_ripe_atlas_credentials()
24 |
25 | self.account = ripe_credentials["username"]
26 | self.key = ripe_credentials["secret_key"]
27 |
28 | self.driver = RIPEAtlas(self.account, self.key)
29 |
30 | def ping_by_prefix(
31 | self,
32 | target_prefixes: list,
33 | vps: dict,
34 | targets_per_prefix: dict,
35 | tag: str,
36 | nb_packets: int = NB_PACKETS,
37 | nb_targets: int = NB_TARGETS_PER_PREFIX,
38 | dry_run: bool = False,
39 | ):
40 | """from a list of prefixes, start measurements for n target addrs in prefix"""
41 |
42 | active_measurements = []
43 | all_measurement_ids = []
44 | start_time = time.time()
45 | for i, target_prefix in enumerate(target_prefixes):
46 |
47 | logger.info(
48 | f"Ping for target prefix:: {target_prefix}, {i+1}/{len(target_prefixes)}"
49 | )
50 |
51 | # get target_addr_list
52 | target_addr_list = get_target_hitlist(
53 | target_prefix, nb_targets, targets_per_prefix
54 | )
55 |
56 | # get vps id for measurement, remove target if in vps
57 |
58 | logger.debug(
59 | f"starting measurement for {target_prefix} with {[addr for addr in target_addr_list]}"
60 | )
61 |
62 | for target_addr in target_addr_list:
63 | vp_ids = [vp["id"] for vp in vps if vp["address_v4"] != target_addr]
64 | for i in range(0, len(vp_ids), MAX_NUMBER_OF_VPS):
65 | subset_vp_ids = vp_ids[i : i + MAX_NUMBER_OF_VPS]
66 |
67 | logger.debug(
68 | f"starting measurement for {target_addr} with {len(subset_vp_ids)} vps"
69 | )
70 |
71 | if not dry_run:
72 | measurement_id = self.driver.ping(
73 | str(target_addr), subset_vp_ids, str(tag), nb_packets
74 | )
75 |
76 | logger.info(
77 | f"measurement tag: {tag} : started measurement id : {measurement_id}"
78 | )
79 | else:
80 | measurement_id = 404
81 |
82 | active_measurements.append(measurement_id)
83 | all_measurement_ids.append(measurement_id)
84 |
85 | # check number of parallel measurements in not too high
86 | if len(active_measurements) >= NB_MAX_CONCURRENT_MEASUREMENTS:
87 | logger.info(
88 | f"Reached limit for number of concurrent measurements: {len(active_measurements)}"
89 | )
90 | tmp_measurement_ids = copy(active_measurements)
91 | for id in tmp_measurement_ids:
92 | # wait for the last measurement of the batch to end before starting a new one
93 | if not dry_run:
94 | measurement_result = wait_for(id)
95 | if measurement_result:
96 | active_measurements.remove(id)
97 | else:
98 | active_measurements.remove(id)
99 | time.sleep(0.5)
100 |
101 | logger.info(f"measurement : {tag} done")
102 |
103 | end_time = time.time()
104 |
105 | return all_measurement_ids, start_time, end_time
106 |
107 | def ping_by_target(
108 | self,
109 | targets: list[dict],
110 | vps: list[dict],
111 | tag: str,
112 | nb_packets: int = NB_PACKETS,
113 | dry_run: bool = False,
114 | ):
115 | """from a list of prefixes, start measurements for n target addrs in prefix"""
116 |
117 | active_measurements = []
118 | all_measurement_ids = []
119 | start_time = time.time()
120 | for i, target_addr in enumerate(targets):
121 | logger.info(f"Ping for target:: {target_addr}, {i+1}/{len(targets)}")
122 |
123 | # get vps id for measurement, remove target if in vps
124 | vp_ids = [vp["id"] for vp in vps if vp["address_v4"] != target_addr]
125 |
126 | for i in range(0, len(vp_ids), MAX_NUMBER_OF_VPS):
127 | subset_vp_ids = vp_ids[i : i + MAX_NUMBER_OF_VPS]
128 |
129 | logger.debug(
130 | f"starting measurement for {target_addr} with {len(subset_vp_ids)} vps"
131 | )
132 |
133 | if not dry_run:
134 | measurement_id = self.driver.ping(
135 | str(target_addr), subset_vp_ids, str(tag), nb_packets
136 | )
137 | else:
138 | measurement_id = 404
139 |
140 | active_measurements.append(measurement_id)
141 | all_measurement_ids.append(measurement_id)
142 |
143 | logger.info(
144 | f"measurement tag: {tag} : started measurement id : {measurement_id}"
145 | )
146 |
147 | # check number of parallel measurements in not too high
148 | if len(active_measurements) >= NB_MAX_CONCURRENT_MEASUREMENTS:
149 | logger.info(
150 | f"Reached limit for number of concurrent measurements: {len(active_measurements)}"
151 | )
152 | tmp_measurement_ids = copy(active_measurements)
153 | for id in tmp_measurement_ids:
154 | # wait for the last measurement of the batch to end before starting a new one
155 | if not dry_run:
156 | measurement_result = wait_for(id)
157 | if measurement_result:
158 | active_measurements.remove(id)
159 | else:
160 | active_measurements.remove(id)
161 | time.sleep(0.5)
162 |
163 | logger.info(f"measurement : {tag} done")
164 |
165 | end_time = time.time()
166 |
167 | return all_measurement_ids, start_time, end_time
168 |
169 |
170 | class TRACEROUTE:
171 | def __init__(
172 | self,
173 | ) -> None:
174 | ripe_credentials = get_ripe_atlas_credentials()
175 |
176 | self.account = ripe_credentials["username"]
177 | self.key = ripe_credentials["secret_key"]
178 | self.driver = RIPEAtlas(self.account, self.key)
179 |
180 | def traceroute(self, target, probe_id):
181 | description = "Geoloc project"
182 | tags = ["traceroute", "test", "geoloc"]
183 | is_public = True
184 | probes = {"value": str(probe_id), "type": "probes", "requested": 1}
185 | packets = 3
186 | protocol = "ICMP"
187 | options = (self.key, description, tags, is_public, packets, protocol)
188 |
189 | response = self.driver.traceroute_measurement(target, probes, options)
190 |
191 | if "measurements" in response and len(response["measurements"]) == 1:
192 | return response["measurements"][0]
193 | else:
194 | print(f"Failed to traceroute")
195 | pprint(response)
196 | return None
197 |
--------------------------------------------------------------------------------
/scripts/street_level/landmark.py:
--------------------------------------------------------------------------------
1 | # Do the landmark selection step as explained in the street level paper
2 |
3 | import requests
4 | import overpy
5 | import dns
6 | import dns.resolver
7 | import urllib3
8 | import pyasn
9 | import warnings
10 |
11 | from multiprocessing import Pool
12 | from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
13 | from geopy import Point, distance
14 |
15 | from scripts.utils.file_utils import load_json, dump_json
16 | from default import CACHED_WEBSITES_FILE, IP_TO_ASN_FILE
17 |
18 |
19 | warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
20 | urllib3.disable_warnings()
21 |
22 |
23 | def get_bounding_box(lat, lon):
24 | p = Point(lat, lon)
25 | d = distance.distance(kilometers=2).meters
26 | top_right = distance.distance(meters=d).destination(p, 45)
27 | bottom_left = distance.distance(meters=d).destination(p, 225)
28 | return (bottom_left.latitude, bottom_left.longitude, top_right.latitude, top_right.longitude)
29 |
30 |
31 | def check_domain_name_ip(domain_name, ip_address, protocol):
32 | # print(f"Checking {domain_name}")
33 | ip_url = protocol + "://" + ip_address
34 | domain_url = protocol + "://" + domain_name
35 | try:
36 | ip_response = requests.get(ip_url, verify=False, timeout=1)
37 | if ip_response.status_code != 200:
38 | return False
39 | domain_response = requests.get(domain_url, timeout=1)
40 | if domain_response.status_code != 200:
41 | return False
42 | except Exception:
43 | # print(traceback.format_exc())
44 | return False
45 |
46 | try:
47 | ip_soup = BeautifulSoup(ip_response.content, "html.parser")
48 | domain_soup = BeautifulSoup(domain_response.content, "html.parser")
49 | ip_title = ip_soup.head.title.text
50 | domain_title = domain_soup.head.title.text
51 | if ip_title == domain_title:
52 | return True
53 | else:
54 | return False
55 | except:
56 | return False
57 |
58 |
59 | def check_and_get_website_ip(website, protocol):
60 | asns = ['20940', '16625', '12222', '16625', '21342', '21399', '32787', '35994', '35993', '35995', '36408', '393234', '394689',
61 | '13335', '202018', '202109', '133293', '395747',
62 | '54113', '209242',
63 | '16509', '14618', '16509', '39111', '16509',
64 | '8075', '8075', '8075', '12076', '12222',
65 | '15169', '36351', '22577', '36040', '55023',
66 | '22822',
67 | '701', '22394, 11608, 11608',
68 | '3356', '133229, 133229, 395570',
69 | '60068', '136620', '395354',
70 | '32934']
71 | res = {}
72 | asndb = pyasn.pyasn(str(IP_TO_ASN_FILE))
73 | try:
74 | result = dns.resolver.resolve(website)
75 | except Exception:
76 | # print(traceback.format_exc())
77 | return {'dns-failed': True}
78 | if len(result) == 0:
79 | return {'dns-failed': True}
80 | res = {'dns-failed': False}
81 |
82 | ip = result[0].to_text()
83 | res['ip'] = ip
84 | asn = asndb.lookup(ip)[0]
85 | if asn == None:
86 | res['asn-found'] = False
87 | return res
88 | else:
89 | res['asn-found'] = True
90 | if str(asn) in asns or 'google' in website or 'facebook' in website or 'amazon' in website or 'microsoft' in website or 'azure' in website or 'akamai' in website or 'cdn' in website:
91 | res['cdn'] = True
92 | return res
93 | else:
94 | res['cdn'] = False
95 |
96 | if check_domain_name_ip(website, ip, protocol):
97 | res['header-test'] = True
98 | return res
99 | else:
100 | res['header-test'] = False
101 | return res
102 |
103 |
104 | def get_one_website_ip(domain, protocol, lat, lon):
105 | ip_info = check_and_get_website_ip(domain, protocol)
106 | ip_info['domain'] = domain
107 | ip_info['protocol'] = protocol
108 | ip_info['lat'] = lat
109 | ip_info['lon'] = lon
110 | return ip_info
111 |
112 |
113 | def get_landmarks_with_website_from_lat_lon(lat_arg, lon_arg):
114 | # api = overpy.Overpass()
115 | # api = overpy.Overpass(url="https://overpass.kumi.systems/api/interpreter")
116 | api = overpy.Overpass(
117 | url="https://maps.mail.ru/osm/tools/overpass/api/interpreter")
118 | bbox = get_bounding_box(lat_arg, lon_arg)
119 | query = f"""
120 | [out:json];
121 | (
122 | node ({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]})
123 | [website];
124 | way ({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]})
125 | [website];
126 | );
127 | out;
128 | """
129 | result = api.query(query)
130 | res = []
131 | for node in result.nodes:
132 | lat = float(node.lat)
133 | lon = float(node.lon)
134 | tags = node.tags
135 | website = tags['website']
136 | res.append((website, lat, lon))
137 | for way in result.ways:
138 | try:
139 | tmp_lat = 0
140 | tmp_lon = 0
141 | nodes = way.get_nodes(resolve_missing=True)
142 | for node in nodes:
143 | tmp_lat += float(node.lat)
144 | tmp_lon += float(node.lon)
145 | lat = tmp_lat/len(nodes)
146 | lon = tmp_lon/len(nodes)
147 | tags = way.tags
148 | website = tags['website']
149 | res.append((website, lat, lon))
150 | except:
151 | continue
152 | return res
153 |
154 |
155 | def get_all_landmarks_and_stats_from_points(points):
156 | dict_website = {}
157 | with Pool(8) as pool:
158 | results = pool.starmap(get_landmarks_with_website_from_lat_lon, points)
159 | for result in results:
160 | if result != None and result != []:
161 | for elem in result:
162 | dict_website[elem[0]] = elem
163 |
164 | unique_website = {}
165 | for url in dict_website:
166 | if "://" in url:
167 | protocol = url.split("://")[0]
168 | domain_name = url.split("://")[1]
169 | else:
170 | protocol = "http"
171 | domain_name = url
172 | website = domain_name.split("/")[0]
173 | if (website, protocol) not in unique_website:
174 | unique_website[(website, protocol)] = dict_website[url]
175 |
176 | args = []
177 | failed_dns_count = 0
178 | failed_asn_count = 0
179 | cdn_count = 0
180 | failed_header_test_count = 0
181 | landmarks = []
182 |
183 | try:
184 | all_websites = load_json(CACHED_WEBSITES_FILE)
185 | except FileNotFoundError:
186 | all_websites = {}
187 |
188 | for k, v in unique_website.items():
189 | # fix websites
190 | if 'google' in k or 'facebook' in k or 'amazon' in k or 'microsoft' in k or 'azure' in k or 'akamai' in k or 'cdn' in k:
191 | all_websites[k]['cdn'] = True
192 |
193 | if k[0] not in all_websites:
194 | args.append((k[0], k[1], v[1], v[2]))
195 | else:
196 | result = all_websites[k[0]]
197 | if 'dns-failed' not in result or result['dns-failed']:
198 | failed_dns_count += 1
199 | continue
200 | if 'asn-found' not in result or not result['asn-found']:
201 | failed_asn_count += 1
202 | continue
203 | if 'cdn' not in result or result['cdn']:
204 | cdn_count += 1
205 | continue
206 | if 'header-test' not in result or not result['header-test']:
207 | failed_header_test_count += 1
208 | continue
209 | landmarks.append(
210 | (result['ip'], result['domain'], result['lat'], result['lon']))
211 |
212 | with Pool() as pool:
213 | results = pool.starmap(get_one_website_ip, args)
214 | for result in results:
215 | all_websites[result['domain']] = result
216 | if 'dns-failed' not in result or result['dns-failed']:
217 | failed_dns_count += 1
218 | continue
219 | if 'asn-found' not in result or not result['asn-found']:
220 | failed_asn_count += 1
221 | continue
222 | if 'cdn' not in result or result['cdn']:
223 | cdn_count += 1
224 | continue
225 | if 'header-test' not in result or not result['header-test']:
226 | failed_header_test_count += 1
227 | continue
228 | landmarks.append(
229 | (result['ip'], result['domain'], result['lat'], result['lon']))
230 |
231 | dump_json(all_websites, CACHED_WEBSITES_FILE)
232 |
233 | return failed_dns_count, failed_asn_count, cdn_count, failed_header_test_count, landmarks
234 |
--------------------------------------------------------------------------------
/scripts/street_level/three_tiers.py:
--------------------------------------------------------------------------------
1 | # One function per tier of the street level method.
2 |
3 | import time
4 |
5 | from scripts.analysis.analysis import local_circle_preprocessing
6 | from scripts.street_level.landmark import get_all_landmarks_and_stats_from_points
7 | from scripts.utils.helpers import get_center_of_poly, get_points_in_poly
8 | from scripts.street_level.traceroutes_results import (
9 | get_circles_to_target,
10 | start_and_get_traceroutes,
11 | )
12 |
13 |
14 | def tier_1(target_ip, res, vps=None):
15 | st = time.time()
16 | # Get all circles (from each VP to the target)
17 | all_circles = get_circles_to_target(target_ip, vps)
18 |
19 | # Try the recommended internet speed at first
20 | speed_threshold = 4 / 9
21 | imp_circles = local_circle_preprocessing(
22 | all_circles, speed_threshold=speed_threshold
23 | )
24 | lat, lon = get_center_of_poly(imp_circles, speed_threshold)
25 |
26 | # If there is no intersection polygone try a slower interent speed
27 | if lat == None or lon == None:
28 | speed_threshold = 2 / 3
29 | imp_circles = local_circle_preprocessing(
30 | all_circles, speed_threshold=speed_threshold
31 | )
32 | lat, lon = get_center_of_poly(imp_circles, speed_threshold)
33 | res["speed_threshold"] = speed_threshold
34 | res["tier1:lat"] = lat
35 | res["tier1:lon"] = lon
36 | res["vps"] = imp_circles
37 | et = time.time()
38 | # Saving the time needed to perform this step
39 | res["tier1:duration"] = et - st
40 | return res
41 |
42 |
43 | def tier_2(target_ip, res, vps=None):
44 | st = time.time()
45 | tier2_points = get_points_in_poly(res["vps"], 36, 5, res["speed_threshold"])
46 | res["tier2:all_points_count"] = len(tier2_points)
47 |
48 | # We remove points further than 1000km from the estimated center of the polygone (in case the intersection area is too big)
49 | tier2_points = tier2_points[: 200 * 10 + 1]
50 | res["tier2:inspected_points_count"] = len(tier2_points)
51 | if len(tier2_points) == 0:
52 | res["tier2:lat"] = None
53 | res["tier2:lon"] = None
54 | et = time.time()
55 | res["tier2:duration"] = et - st
56 | return res
57 |
58 | (
59 | failed_dns_count,
60 | failed_asn_count,
61 | cdn_count,
62 | failed_header_test_count,
63 | landmarks,
64 | ) = get_all_landmarks_and_stats_from_points(tier2_points)
65 | # We save stats for possiblity of a website to be used as a landmark
66 | res["tier2:failed_dns_count"] = failed_dns_count
67 | res["tier2:failed_asn_count"] = failed_asn_count
68 | res["tier2:cdn_count"] = cdn_count
69 | res["tier2:non_cdn_count"] = len(landmarks) + failed_header_test_count
70 | res["tier2:landmark_count"] = len(landmarks)
71 | res["tier2:failed_header_test_count"] = failed_header_test_count
72 | res["tier2:landmarks"] = landmarks
73 |
74 | if len(res["tier2:landmarks"]) == 0:
75 | res["tier2:lat"] = None
76 | res["tier2:lon"] = None
77 | et = time.time()
78 | res["tier2:duration"] = et - st
79 | return res
80 |
81 | res["tier2:traceroutes"] = start_and_get_traceroutes(
82 | target_ip, res["vps"], res["tier2:landmarks"], vps
83 | )
84 | all_circles = []
85 | best_rtt = 5000
86 | res_lat = None
87 | res_lon = None
88 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[
89 | "tier2:traceroutes"
90 | ]:
91 | if rtt < 0:
92 | continue
93 | all_circles.append((lat, lon, rtt, None, None))
94 | if rtt < best_rtt:
95 | best_rtt = rtt
96 | res_lat = lat
97 | res_lon = lon
98 |
99 | # If there is no valid RTT then tier 2 has failed and we can not go further
100 | if len(all_circles) == 0:
101 | res["tier2:lat"] = None
102 | res["tier2:lon"] = None
103 | et = time.time()
104 | res["tier2:duration"] = et - st
105 | return res
106 |
107 | # If not, we use the smallest rtt landmark as
108 | res["tier2:lat"] = res_lat
109 | res["tier2:lon"] = res_lon
110 | res["tier2:final_circles"] = all_circles
111 | et = time.time()
112 | res["tier2:duration"] = et - st
113 | return res
114 |
115 |
116 | def tier_3(target_ip, res, vps=None):
117 | st = time.time()
118 | if "tier2:final_circles" not in res:
119 | res["tier3:lat"] = None
120 | res["tier3:lon"] = None
121 | et = time.time()
122 | res["tier3:duration"] = et - st
123 | return res
124 |
125 | else:
126 | all_circles = res["tier2:final_circles"]
127 |
128 | imp_circles = local_circle_preprocessing(
129 | all_circles, speed_threshold=res["speed_threshold"]
130 | )
131 | tier3_points = get_points_in_poly(
132 | imp_circles, 10, 1, res["speed_threshold"], res["vps"]
133 | )
134 | res["tier3:all_points_count"] = len(tier3_points)
135 |
136 | # We remove points/zipcodes further then 40Km away from the center of the polygone
137 | tier3_points = tier3_points[: 40 * 36 + 1]
138 | res["tier3:inspected_points_count"] = len(tier3_points)
139 | if len(tier3_points) == 0:
140 | res["tier3:lat"] = None
141 | res["tier3:lon"] = None
142 | et = time.time()
143 | res["tier3:duration"] = et - st
144 | return res
145 |
146 | (
147 | failed_dns_count,
148 | failed_asn_count,
149 | cdn_count,
150 | failed_header_test_count,
151 | tmp_landmarks,
152 | ) = get_all_landmarks_and_stats_from_points(tier3_points)
153 | landmarks = []
154 | for landmark in tmp_landmarks:
155 | ip = landmark[0]
156 | found = False
157 | for t2_lm in res["tier2:landmarks"]:
158 | if t2_lm[0] == ip:
159 | found = True
160 | break
161 | if not found:
162 | landmarks.append(landmark)
163 |
164 | res["tier3:failed_dns_count"] = failed_dns_count
165 | res["tier3:failed_asn_count"] = failed_asn_count
166 | res["tier3:cdn_count"] = cdn_count
167 | res["tier3:non_cdn_count"] = len(landmarks) + failed_header_test_count
168 | res["tier3:landmark_count"] = len(landmarks)
169 | res["tier3:failed_header_test_count"] = failed_header_test_count
170 | res["tier3:landmarks"] = landmarks
171 |
172 | if len(res["tier3:landmarks"]) == 0:
173 | res["tier3:lat"] = None
174 | res["tier3:lon"] = None
175 | et = time.time()
176 | res["tier3:duration"] = et - st
177 | return res
178 |
179 | res["tier3:traceroutes"] = start_and_get_traceroutes(
180 | target_ip, res["vps"], res["tier3:landmarks"], vps
181 | )
182 |
183 | best_lon = None
184 | best_lat = None
185 | best_rtt = 5000
186 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[
187 | "tier2:traceroutes"
188 | ]:
189 | if rtt < 0:
190 | continue
191 | if rtt < best_rtt:
192 | best_rtt = rtt
193 | best_lon = lon
194 | best_lat = lat
195 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[
196 | "tier3:traceroutes"
197 | ]:
198 | if rtt < 0:
199 | continue
200 | if rtt < best_rtt:
201 | best_rtt = rtt
202 | best_lon = lon
203 | best_lat = lat
204 |
205 | res["tier3:lat"] = best_lat
206 | res["tier3:lon"] = best_lon
207 | et = time.time()
208 | res["tier3:duration"] = et - st
209 | return res
210 |
211 |
212 | def get_all_info_geoloc(target_ip, vps=None):
213 | # Init results
214 | res = {
215 | "target_ip": target_ip,
216 | "tier1:done": False,
217 | "tier2:done": False,
218 | "tier3:done": False,
219 | "negative_rtt_included": True,
220 | }
221 | res = tier_1(target_ip, res, vps=vps)
222 |
223 | # Using tier 1(CBG) results as geolocation if the other steps fail
224 | res["lat"] = res["tier1:lat"]
225 | res["lon"] = res["tier1:lon"]
226 | if res["tier1:lat"] == None or res["tier1:lon"] == None:
227 | return res
228 | res["tier1:done"] = True
229 |
230 | res = tier_2(target_ip, res, vps=vps)
231 |
232 | # Using tier 2 resultsas geolocation if the last step fails
233 | if res["tier2:lat"] == None or res["tier2:lon"] == None:
234 | return res
235 | else:
236 | res["tier2:done"] = True
237 | res["lat"] = res["tier2:lat"]
238 | res["lon"] = res["tier2:lon"]
239 |
240 | res = tier_3(target_ip, res, vps=vps)
241 |
242 | if res["tier3:lat"] != None and res["tier3:lon"] != None:
243 | res["tier3:done"] = True
244 | res["lat"] = res["tier3:lat"]
245 | res["lon"] = res["tier3:lon"]
246 |
247 | return res
248 |
249 |
250 | def geoloc(target_ip):
251 | """
252 | This function return a dict containint the lat, lon coordinates of the given target_ip.
253 | The target_ip should be traceroutable.
254 | The function gives a less informative gelocation result than get_all_info_geoloc
255 | """
256 | all_info = get_all_info_geoloc(target_ip)
257 | return {"lat": all_info["lat"], "lon": all_info["lon"]}
258 |
--------------------------------------------------------------------------------
/scripts/street_level/traceroutes_results.py:
--------------------------------------------------------------------------------
1 | """Intermediate functions during street level traceroutes process"""
2 |
3 | import time
4 |
5 | from scripts.utils.clickhouse import Clickhouse
6 | from scripts.utils.file_utils import load_json
7 | from scripts.ripe_atlas.ping_and_traceroute_classes import TRACEROUTE
8 | from scripts.ripe_atlas.atlas_api import fetch_traceroutes_from_measurement_ids_no_csv
9 | from default import USER_ANCHORS_FILE, STREET_LEVEL_TRACEROUTES_TABLE
10 |
11 |
12 | def start_traceroutes_to_targets(targets, probes):
13 | results_to_get = []
14 | for target in targets:
15 | target_ip = target[0]
16 | for probe in probes:
17 | probe_ip = probe["address_v4"]
18 | probe_id = str(probe["id"])
19 | trace = TRACEROUTE()
20 | res = trace.traceroute(target_ip, probe_id)
21 | if res != None:
22 | results_to_get.append((res, probe_ip, target_ip))
23 | return results_to_get
24 |
25 |
26 | def get_traceroutes_results(traceroute_ids):
27 | next_to_do = []
28 | for id in traceroute_ids:
29 | next_to_do.append(id)
30 | nb_tries = 20
31 | while nb_tries > 0 and len(next_to_do) > 0:
32 | nb_tries -= 1
33 | to_do = []
34 | for id in next_to_do:
35 | to_do.append(id)
36 |
37 | next_to_do = []
38 |
39 | for id in to_do:
40 | try:
41 | ids = [id]
42 | traceroute_data = fetch_traceroutes_from_measurement_ids_no_csv(ids)
43 | if len(traceroute_data) == 0:
44 | next_to_do.append(id)
45 | else:
46 | insert_lst = []
47 | for t in traceroute_data:
48 | ts = t.split(",")
49 | insert_lst.append(
50 | (
51 | ts[0],
52 | ts[1],
53 | ts[2],
54 | ts[3],
55 | int(ts[4]),
56 | int(ts[5]),
57 | float(ts[6]),
58 | int(ts[7]),
59 | int(ts[8]),
60 | int(ts[9]),
61 | int(ts[10]),
62 | )
63 | )
64 | # We insert traceroute data into the database to be used later
65 | clickhouse_driver = Clickhouse()
66 | query = clickhouse_driver.insert_street_lvl_traceroutes_query(STREET_LEVEL_TRACEROUTES_TABLE)
67 | clickhouse_driver.execute(query, insert_lst)
68 | except Exception:
69 | next_to_do.append(id)
70 | if len(next_to_do) > 0:
71 | # We wait to try again
72 | time.sleep(15)
73 |
74 |
75 | """
76 | Function starts and fetches traceroute from all probes to all targets
77 | """
78 |
79 |
80 | def multi_traceroutes(targets, probes):
81 | tmp_res_traceroutes = start_traceroutes_to_targets(targets, probes)
82 | traceroute_ids = []
83 | for elem in tmp_res_traceroutes:
84 | traceroute_ids.append(elem[0])
85 |
86 | get_traceroutes_results(traceroute_ids)
87 | return tmp_res_traceroutes
88 |
89 |
90 | def tier_1_performe_traceroutes(target_ip, vps=None):
91 | # Traceroute from every VP to the target
92 | if vps == None:
93 | probes = load_json(USER_ANCHORS_FILE)
94 | else:
95 | probes = vps
96 | multi_traceroutes([[target_ip]], probes)
97 |
98 |
99 | def get_circles_to_target(target_ip, vps=None):
100 | # Get Rtts from all VPs to the targets if traceroutes are already done
101 | clickhouse_driver = Clickhouse()
102 | query = clickhouse_driver.get_all_rtt_to_dst_address_query(STREET_LEVEL_TRACEROUTES_TABLE, target_ip)
103 | res = clickhouse_driver.execute(query)
104 | # If None we need to lunch traceroutes from every VP to the target
105 | if len(res) == 0:
106 | tier_1_performe_traceroutes(target_ip, vps)
107 | res = clickhouse_driver.execute(query)
108 | if len(res) == 0:
109 | return []
110 |
111 | # Calculate per VP min RTT
112 | dict_rtt = {}
113 | for hop in res:
114 | if hop[0] not in dict_rtt:
115 | dict_rtt[hop[0]] = (hop[1], hop[2])
116 | if hop[2] > dict_rtt[hop[0]][1]:
117 | dict_rtt[hop[0]] = (hop[1], hop[2])
118 | if hop[2] == dict_rtt[hop[0]][1] and hop[1] < dict_rtt[hop[0]][0]:
119 | dict_rtt[hop[0]] = (hop[1], hop[2])
120 |
121 | # From IPs get Geolocation given by RIPE Atlas
122 | if vps == None:
123 | probes_data = load_json(USER_ANCHORS_FILE)
124 | else:
125 | probes_data = vps
126 | dict_probe_info = {}
127 | for probe in probes_data:
128 | if probe["address_v4"] == target_ip:
129 | continue
130 | if "address_v4" not in probe or probe["address_v4"] not in dict_rtt:
131 | continue
132 | if (
133 | "geometry" not in probe
134 | or "type" not in probe["geometry"]
135 | or probe["geometry"]["type"] != "Point"
136 | or "coordinates" not in probe["geometry"]
137 | ):
138 | continue
139 | lon, lat = probe["geometry"]["coordinates"]
140 | dict_probe_info[probe["address_v4"]] = (
141 | lat,
142 | lon,
143 | dict_rtt[probe["address_v4"]][0],
144 | None,
145 | None,
146 | )
147 |
148 | # Return a list of items
149 | # each Item is a VP (lat, lon, min_rtt, dist = None, dist_r = None)
150 | res = []
151 | for k, v in dict_probe_info.items():
152 | res.append(v)
153 | return res
154 |
155 |
156 | def get_rtt_diff(probe_ip, target_ip, landmark_ip):
157 | clickhouse_driver = Clickhouse()
158 | query = clickhouse_driver.get_all_rtt_from_probe_to_targets_query(STREET_LEVEL_TRACEROUTES_TABLE, probe_ip, target_ip, landmark_ip)
159 | res = clickhouse_driver.execute(query)
160 | rtt_dict_target = {}
161 | rtt_dict_landmark = {}
162 |
163 | for l in res:
164 | resp_ip = l[0]
165 | dst_ip = l[1]
166 | rtt = l[2]
167 | if dst_ip == target_ip:
168 | if resp_ip not in rtt_dict_target:
169 | rtt_dict_target[resp_ip] = rtt
170 | if rtt < rtt_dict_target[resp_ip]:
171 | rtt_dict_target[resp_ip] = rtt
172 | elif dst_ip == landmark_ip:
173 | if resp_ip not in rtt_dict_landmark:
174 | rtt_dict_landmark[resp_ip] = rtt
175 | if rtt < rtt_dict_landmark[resp_ip]:
176 | rtt_dict_landmark[resp_ip] = rtt
177 | if target_ip not in rtt_dict_target or landmark_ip not in rtt_dict_landmark:
178 | return -1, None
179 | target_rtt = rtt_dict_target[target_ip]
180 | landmark_rtt = rtt_dict_landmark[landmark_ip]
181 | same_dict = {}
182 | for ip in rtt_dict_target:
183 | if ip in rtt_dict_landmark:
184 | same_dict[ip] = min(rtt_dict_landmark[ip], rtt_dict_target[ip])
185 | best_rtt = 0
186 | best_ip = None
187 | for k, v in same_dict.items():
188 | if v > best_rtt:
189 | best_rtt = v
190 | best_ip = k
191 | return target_rtt + landmark_rtt - best_rtt - best_rtt, best_ip
192 |
193 |
194 | def get_probes_to_use_for_circles(circles, vps=None):
195 | if vps == None:
196 | probes_data = load_json(USER_ANCHORS_FILE)
197 | else:
198 | probes_data = vps
199 | lats_lons = {}
200 | for circle in circles:
201 | lats_lons[(circle[0], circle[1])] = circle
202 | res = []
203 | for probe in probes_data:
204 | if (
205 | "geometry" not in probe
206 | or "type" not in probe["geometry"]
207 | or probe["geometry"]["type"] != "Point"
208 | or "coordinates" not in probe["geometry"]
209 | ):
210 | continue
211 | lon, lat = probe["geometry"]["coordinates"]
212 | if (lat, lon) in lats_lons:
213 | res.append(probe)
214 | return res
215 |
216 |
217 | def start_and_get_traceroutes(target_ip, used_vps, landmarks, all_vps):
218 | probes = get_probes_to_use_for_circles(used_vps, all_vps)
219 | tmp_res_traceroutes = multi_traceroutes(landmarks, probes)
220 |
221 | # For each traceroute to a landmark we try to get the last common router/IP (r1ip) and the distance d1 + d2 (rtt)
222 | res = []
223 | for t in tmp_res_traceroutes:
224 | traceroute_id = t[0]
225 | probe_ip = t[1]
226 | landmark_ip = t[2]
227 | rtt, r1ip = get_rtt_diff(probe_ip, target_ip, landmark_ip)
228 | for landmark in landmarks:
229 | if landmark[0] == landmark_ip:
230 | res.append(
231 | (
232 | probe_ip,
233 | target_ip,
234 | landmark_ip,
235 | r1ip,
236 | rtt,
237 | landmark[2],
238 | landmark[3],
239 | traceroute_id,
240 | )
241 | )
242 | break
243 | return res
244 |
245 |
246 | def serialize(res1):
247 | res = {}
248 | for k, v in res1.items():
249 | res[k] = v
250 | if "vps" in res:
251 | tmp_lst = []
252 | for x in res["vps"]:
253 | tmp_lst.append(list(x))
254 | res["vps"] = tmp_lst
255 | if "tier2:landmarks" in res:
256 | tmp_lst = []
257 | for x in res["tier2:landmarks"]:
258 | tmp_lst.append(list(x))
259 | res["tier2:landmarks"] = tmp_lst
260 | if "tier2:traceroutes" in res:
261 | tmp_lst = []
262 | for x in res["tier2:traceroutes"]:
263 | tmp_lst.append(list(x))
264 | res["tier2:traceroutes"] = tmp_lst
265 | if "tier3:landmarks" in res:
266 | tmp_lst = []
267 | for x in res["tier3:landmarks"]:
268 | tmp_lst.append(list(x))
269 | res["tier3:landmarks"] = tmp_lst
270 | if "tier3:traceroutes" in res:
271 | tmp_lst = []
272 | for x in res["tier3:traceroutes"]:
273 | tmp_lst.append(list(x))
274 | res["tier3:traceroutes"] = tmp_lst
275 | return res
276 |
--------------------------------------------------------------------------------
/scripts/utils/clickhouse.py:
--------------------------------------------------------------------------------
1 | """clickhouse client"""
2 |
3 | import subprocess
4 |
5 | from pathlib import Path
6 | from clickhouse_driver import Client
7 |
8 | from logger import logger
9 | from default import (
10 | CLICKHOUSE_HOST,
11 | CLICKHOUSE_DB,
12 | CLICKHOUSE_USER,
13 | CLICKHOUSE_PASSWORD,
14 | CLICKHOUSE_CLIENT,
15 | )
16 |
17 |
18 | class Clickhouse:
19 | def __init__(
20 | self,
21 | host: str = CLICKHOUSE_HOST,
22 | database: str = CLICKHOUSE_DB,
23 | user: str = CLICKHOUSE_USER,
24 | password: str = CLICKHOUSE_PASSWORD,
25 | client_path: Path = CLICKHOUSE_CLIENT,
26 | ) -> None:
27 | self.host = host
28 | self.database = database
29 | self.user = user
30 | self.password = password
31 | self.client_path = client_path
32 |
33 | self.client: Client = Client(
34 | host=self.host, user=self.user, password=self.password
35 | )
36 |
37 | self.settings = {"max_block_size": 100000}
38 |
39 | def get_min_rtt_per_src_dst_query(
40 | self, table: str, filter: str, threshold=10000
41 | ) -> str:
42 | return f"""
43 | WITH arrayMin(groupArray(`min`)) as min_rtt
44 | SELECT IPv4NumToString(dst), IPv4NumToString(src), min_rtt
45 | FROM {self.database}.{table}
46 | WHERE `min` > -1 AND `min`< {threshold} AND dst != src {filter}
47 | GROUP BY (dst, src)
48 | """
49 |
50 | def get_min_rtt_per_src_dst_prefix_query(
51 | self, table: str, filter: str, threshold=10000
52 | ) -> str:
53 | return f"""
54 | WITH arrayMin(groupArray(`min`)) as min_rtt
55 | SELECT IPv4NumToString(dst_prefix), IPv4NumToString(src), min_rtt
56 | FROM {self.database}.{table}
57 | WHERE `min` > -1 AND `min`< {threshold}
58 | AND dst_prefix != toIPv4(substring(cutIPv6(IPv4ToIPv6(src), 0, 1), 8))
59 | {filter}
60 | GROUP BY dst_prefix, src
61 | """
62 |
63 | def get_all_rtt_to_dst_address_query(self, table: str, target: str) -> str:
64 | return f"""
65 | SELECT src_addr, rtt, tstamp
66 | FROM {self.database}.{table}
67 | WHERE resp_addr = '{target}' AND dst_addr = '{target}'
68 | """
69 |
70 | def get_all_rtt_from_probe_to_targets_query(
71 | self, table: str, src: str, target1: str, target2: str
72 | ) -> str:
73 | return f"""
74 | SELECT resp_addr, dst_addr, rtt
75 | FROM {self.database}.{table}
76 | WHERE src_addr = '{src}' and (dst_addr = '{target1}' or dst_addr = '{target2}')
77 | """
78 |
79 | def insert_street_lvl_traceroutes_query(self, table: str) -> str:
80 | return f"""
81 | INSERT
82 | INTO {self.database}.{table} (
83 | src_addr, dst_prefix, dst_addr, resp_addr,
84 | proto, hop, rtt, ttl, prb_id, msm_id, tstamp
85 | ) VALUES
86 | """
87 |
88 | def insert_native_query(self, table: str, infile_path: Path) -> str:
89 | """insert data using local clickhouse file"""
90 | return f"""
91 | INSERT INTO {self.database}.{table}
92 | FROM INFILE '{str(infile_path)}'
93 | FORMAT Native"""
94 |
95 | def insert_csv_query(self, table: str, infile_path: Path) -> str:
96 | """insert data from csv file"""
97 | return f"""
98 | INSERT INTO {self.database}.{table}
99 | FROM INFILE '{str(infile_path)}'
100 | FORMAT CSV
101 | """
102 |
103 | def insert_file(self, query: str) -> None:
104 | """execute clickhouse insert query as not supported by clickhouse-driver"""
105 | cmd = f"{str(self.client_path)} client"
106 |
107 | if self.password is not None and self.password != "":
108 | cmd += f"--password={self.password}"
109 | cmd += f' --query="{query}"'
110 |
111 | logger.info(f"executing query: {cmd}")
112 |
113 | ps = subprocess.run(cmd, shell=True, capture_output=True, text=True)
114 |
115 | if ps.stderr:
116 | raise RuntimeError(
117 | f"Could not insert data::{cmd}, failed with error: {ps.stderr}"
118 | )
119 | else:
120 | logger.info(f"{cmd}::Successfully executed")
121 |
122 | def execute(self, query: str, arg_lst=[]) -> None:
123 | """execute query using clickhouse driver"""
124 | if arg_lst == []:
125 | return self.client.execute(query, settings=self.settings)
126 | else:
127 | return self.client.execute(query, arg_lst, settings=self.settings)
128 |
129 | def insert_from_values_query(self, table: str, values_description: str) -> str:
130 | """insert data from csv file"""
131 | return f"""
132 | INSERT INTO {self.database}.{table}
133 | ({values_description})
134 | VALUES
135 | """
136 |
137 | def insert_from_values(self, query: str, data: list) -> None:
138 | return self.client.execute(query, data, settings=self.settings)
139 |
140 | def execute_iter(self, query: str) -> None:
141 | """use clickhouse driver instead of subprocess"""
142 | return self.client.execute_iter(query, settings=self.settings)
143 |
144 | def create_prefixes_ping_tables(self, table_name: str) -> str:
145 | """create all ping tables"""
146 | return f"""
147 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name}
148 | (
149 | `src` IPv4,
150 | `dst` IPv4,
151 | `dst_prefix` IPv4 MATERIALIZED toIPv4(substring(cutIPv6(IPv4ToIPv6(dst), 0, 1), 8)),
152 | `prb_id` UInt32,
153 | `date` DateTime,
154 | `sent` UInt32,
155 | `rcvd` UInt32,
156 | `rtts` Array(Float64),
157 | `min` Float64,
158 | `mean` Float64,
159 | `msm_id` UInt64,
160 | `proto` UInt8
161 | )
162 | ENGINE=MergeTree()
163 | ORDER BY (dst_prefix, dst, src, msm_id, date)
164 | """
165 |
166 | def create_target_ping_tables(self, table_name: str) -> str:
167 | """create table"""
168 | return f"""
169 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name}
170 | (
171 | `src` IPv4,
172 | `dst` IPv4,
173 | `prb_id` UInt32,
174 | `date` DateTime,
175 | `sent` UInt32,
176 | `rcvd` UInt32,
177 | `rtts` Array(Float64),
178 | `min` Float64,
179 | `mean` Float64,
180 | `msm_id` UInt64,
181 | `proto` UInt8
182 | )
183 | ENGINE=MergeTree()
184 | ORDER BY (dst, src, msm_id, date)
185 | """
186 |
187 | def create_traceroutes_table(self, table_name: str) -> str:
188 | return f"""
189 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name}
190 | (
191 | `src_ip` String,
192 | `dst_prefix` String,
193 | `dst_ip` String,
194 | `reply_ip` String,
195 | `proto` Int16,
196 | `hop` Int16,
197 | `rtt` Float64,
198 | `ttl` Int16,
199 | `prb_id` Int64,
200 | `msm_id` Int64,
201 | `timestamp` DateTime('UTC')
202 | )
203 | ENGINE=MergeTree()
204 | ORDER BY (dst_prefix, dst_ip, src_ip, reply_ip)
205 | """
206 |
207 | def create_street_level_table(self, table_name: str) -> str:
208 | """create the street level traceroute table"""
209 |
210 | return f"""
211 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name}
212 | (
213 | `src_addr` String,
214 | `dst_prefix` String,
215 | `dst_addr` String,
216 | `resp_addr` String,
217 | `proto` Int16,
218 | `hop` Int16,
219 | `rtt` Float64,
220 | `ttl` Int16,
221 | `prb_id` Int64,
222 | `msm_id` Int64,
223 | `tstamp` Datetime('UTC')
224 | )
225 | ENGINE = MergeTree()
226 | ORDER BY (dst_addr, src_addr, tstamp)
227 | """
228 |
--------------------------------------------------------------------------------
/scripts/utils/clickhouse_installer.py:
--------------------------------------------------------------------------------
1 | """clickhouse client"""
2 |
3 | from scripts.utils.clickhouse import Clickhouse
4 | from logger import logger
5 |
6 | from default import *
7 |
8 |
9 | if __name__ == "__main__":
10 | clickhouse_driver = Clickhouse()
11 |
12 | ##################################################################################################
13 | # CREATE REPRO TABLES #
14 | ##################################################################################################
15 |
16 | # create anchors_meshed_table
17 | query = clickhouse_driver.create_target_ping_tables(ANCHORS_MESHED_PING_TABLE)
18 | clickhouse_driver.execute(query)
19 | logger.info(f"table {ANCHORS_MESHED_PING_TABLE} created")
20 |
21 | query = clickhouse_driver.create_target_ping_tables(PROBES_TO_ANCHORS_PING_TABLE)
22 | clickhouse_driver.execute(query)
23 | logger.info(f"table {PROBES_TO_ANCHORS_PING_TABLE} created")
24 |
25 | # create prefixes ping table
26 | query = clickhouse_driver.create_prefixes_ping_tables(ANCHORS_TO_PREFIX_TABLE)
27 | clickhouse_driver.execute(query)
28 | logger.info(f"table {ANCHORS_TO_PREFIX_TABLE} created")
29 |
30 | query = clickhouse_driver.create_prefixes_ping_tables(PROBES_TO_PREFIX_TABLE)
31 | clickhouse_driver.execute(query)
32 | logger.info(f"table {PROBES_TO_PREFIX_TABLE} created")
33 |
34 | query = clickhouse_driver.create_prefixes_ping_tables(
35 | TARGET_TO_LANDMARKS_PING_TABLE
36 | )
37 | clickhouse_driver.execute(query)
38 | logger.info(f"table {TARGET_TO_LANDMARKS_PING_TABLE} created")
39 |
40 | # create traceroute table
41 | query = clickhouse_driver.create_traceroutes_table(ANCHORS_MESHED_TRACEROUTE_TABLE)
42 | clickhouse_driver.execute(query)
43 | logger.info(f"table {ANCHORS_MESHED_TRACEROUTE_TABLE} created")
44 |
45 | # Create street level db
46 | query = clickhouse_driver.create_street_level_table(STREET_LEVEL_TRACEROUTES_TABLE)
47 | clickhouse_driver.execute(query)
48 | logger.info(f"table {STREET_LEVEL_TRACEROUTES_TABLE} created")
49 |
50 | ##################################################################################################
51 | # INSERT REPRO DATA #
52 | ##################################################################################################
53 |
54 | # table names
55 | tables = [
56 | ANCHORS_MESHED_TRACEROUTE_TABLE,
57 | PROBES_TO_ANCHORS_PING_TABLE,
58 | ANCHORS_TO_PREFIX_TABLE,
59 | PROBES_TO_PREFIX_TABLE,
60 | ANCHORS_MESHED_PING_TABLE,
61 | TARGET_TO_LANDMARKS_PING_TABLE,
62 | STREET_LEVEL_TRACEROUTES_TABLE,
63 | ]
64 |
65 | # measurements files_path
66 | file_paths = [
67 | ANCHORS_MESHED_TRACEROUTE_FILE,
68 | PROBES_TO_ANCHORS_PING_FILE,
69 | ANCHORS_TO_PREFIX_FILE,
70 | PROBES_TO_PREFIX_FILE,
71 | ANCHORS_MESHED_PING_FILE,
72 | TARGET_TO_LANDMARKS_PING_FILE,
73 | STREET_LEVEL_TRACEROUTES_FILE,
74 | ]
75 |
76 | for table_name, file_path in zip(tables, file_paths):
77 | logger.info(f"inserting data into {table_name} from {file_path}")
78 | insert_query = clickhouse_driver.insert_native_query(table_name, file_path)
79 |
80 | clickhouse_driver.insert_file(insert_query)
81 |
82 | ##################################################################################################
83 | # CREATE USER MEASUREMENT TABLES #
84 | ##################################################################################################
85 |
86 | query = clickhouse_driver.create_target_ping_tables(USER_VPS_TO_TARGET_TABLE)
87 | clickhouse_driver.execute(query)
88 | logger.info(f"table {USER_VPS_TO_TARGET_TABLE} created")
89 |
90 | query = clickhouse_driver.create_target_ping_tables(USER_MESHED_TABLE)
91 | clickhouse_driver.execute(query)
92 | logger.info(f"table {USER_MESHED_TABLE} created")
93 |
94 | # create prefixes ping table
95 | query = clickhouse_driver.create_prefixes_ping_tables(USER_VPS_TO_PREFIX_TABLE)
96 | clickhouse_driver.execute(query)
97 | logger.info(f"table {USER_VPS_TO_PREFIX_TABLE} created")
98 |
99 | query = clickhouse_driver.create_prefixes_ping_tables(
100 | USER_TARGET_TO_LANDMARKS_PING_TABLE
101 | )
102 | clickhouse_driver.execute(query)
103 | logger.info(f"table {USER_TARGET_TO_LANDMARKS_PING_TABLE} created")
104 |
105 | # create traceroute table
106 | query = clickhouse_driver.create_traceroutes_table(
107 | USER_ANCHORS_MESHED_TRACEROUTE_TABLE
108 | )
109 | clickhouse_driver.execute(query)
110 | logger.info(f"table {USER_ANCHORS_MESHED_TRACEROUTE_TABLE} created")
111 |
112 | # Create street level db
113 | query = clickhouse_driver.create_street_level_table(
114 | USER_STREET_LEVEL_TRACEROUTES_TABLE
115 | )
116 | clickhouse_driver.execute(query)
117 | logger.info(f"table {USER_STREET_LEVEL_TRACEROUTES_TABLE} created")
118 |
--------------------------------------------------------------------------------
/scripts/utils/credentials.py:
--------------------------------------------------------------------------------
1 | """get all credentials (Clickhouse and RIPE)"""
2 |
3 | import json
4 | import os
5 |
6 | from logger import logger
7 | from dotenv import load_dotenv
8 |
9 | load_dotenv()
10 |
11 |
12 | def get_clickhouse_credentials() -> dict:
13 | """return clickhouse credentials"""
14 |
15 | # try to get credentials with env var directly
16 | try:
17 | return {
18 | "base_url": os.environ["CLICKHOUSE_BASE_URL"],
19 | "user": os.environ["CLICKHOUSE_USER"],
20 | "password": os.environ["CLICKHOUSE_PASSWORD"],
21 | }
22 |
23 | except KeyError as e:
24 | logger.error(
25 | f"Missing credentials for interacting with IRIS API (set: CLICKHOUSE_BASE_URL | CLICKHOUSE_USERNAME | CLICKHOUSE_PASSWORD): {e}"
26 | )
27 |
28 |
29 | def get_ripe_atlas_credentials() -> dict:
30 | """return ripe credentials"""
31 | try:
32 | return {
33 | "username": os.environ["RIPE_USERNAME"],
34 | "secret_key": os.environ["RIPE_SECRET_KEY"],
35 | }
36 |
37 | except KeyError as e:
38 | logger.error(
39 | f"Missing credentials for interacting with IRIS API (set: CLICKHOUSE_BASE_URL | CLICKHOUSE_USERNAME | CLICKHOUSE_PASSWORD): {e}"
40 | )
41 |
--------------------------------------------------------------------------------
/scripts/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | """Functions to load and save data into a json format.
2 | All the paths are given in default.py file.
3 | """
4 | import ujson as json
5 |
6 | from pathlib import Path
7 |
8 |
9 | def load_json(file_path: Path):
10 | # check that dirs exits
11 | if not file_path.parent.exists():
12 | file_path.parent.mkdir(parents=True, exist_ok=True)
13 |
14 | with open(file_path) as f:
15 | return json.load(f)
16 |
17 |
18 | def dump_json(data, file_path: Path):
19 | """dump data to output file"""
20 | # check that dirs exits
21 | if not file_path.parent.exists():
22 | file_path.parent.mkdir(parents=True, exist_ok=True)
23 |
24 | with open(file_path, "w") as f:
25 | json.dump(data, f, indent=4)
26 |
27 |
28 | # def append_results(data, file_path: Paths)
29 |
--------------------------------------------------------------------------------
/scripts/utils/helpers.py:
--------------------------------------------------------------------------------
1 | # Mathematical functions helpful for geolocation problems
2 |
3 | import itertools
4 | import numpy as np
5 |
6 | from math import asin, cos, log, radians, sin, sqrt, pi
7 |
8 |
9 | def internet_speed(rtt, speed_threshold):
10 | if speed_threshold is not None:
11 | return speed_threshold
12 |
13 | if rtt >= 80:
14 | speed_threshold = 4 / 9
15 | if rtt >= 5 and rtt < 80:
16 | speed_threshold = 3 / 9
17 | if rtt >= 0 and rtt < 5:
18 | speed_threshold = 1 / 6
19 |
20 | return speed_threshold
21 |
22 |
23 | def rtt_to_km(rtt, speed_threshold=None, c=300):
24 | return internet_speed(rtt, speed_threshold) * rtt * c / 2
25 |
26 |
27 | def is_within_cirle(vp_geo, rtt, candidate_geo, speed_threshold=None):
28 | d = rtt_to_km(rtt, speed_threshold)
29 | d_vp_candidate = haversine(vp_geo, candidate_geo)
30 | if d < d_vp_candidate:
31 | return False
32 | else:
33 | return True
34 |
35 |
36 | def geo_to_cartesian(lat, lon):
37 | lat *= np.pi / 180
38 | lon *= np.pi / 180
39 |
40 | x = np.cos(lon) * np.cos(lat)
41 | y = np.sin(lon) * np.cos(lat)
42 | z = np.sin(lat)
43 |
44 | return x, y, z
45 |
46 |
47 | def check_circle_inclusion(c_1, c_2):
48 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1
49 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2
50 | d = haversine((lat_1, lon_1), (lat_2, lon_2))
51 | if d_1 > (d + d_2):
52 | return c_1, c_2
53 | elif d_2 > (d + d_1):
54 | return c_2, c_1
55 | return None, None
56 |
57 |
58 | def circle_preprocessing(circles, speed_threshold=None):
59 | circles_to_ignore = set()
60 |
61 | circles_with_r_info = []
62 | for c in circles:
63 | lat, lon, rtt, d, r = c
64 | if d is None:
65 | d = rtt_to_km(rtt, speed_threshold)
66 | if r is None:
67 | r = d / 6371
68 | circles_with_r_info.append((lat, lon, rtt, d, r))
69 |
70 | for i in range(len(circles_with_r_info)):
71 | c_1 = circles_with_r_info[i]
72 | if c_1 in circles_to_ignore:
73 | continue
74 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1
75 | for j in range(i + 1, len(circles_with_r_info)):
76 | c_2 = circles_with_r_info[j]
77 | if c_2 in circles_to_ignore:
78 | continue
79 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2
80 | remove, keep = check_circle_inclusion(
81 | (lat_1, lon_1, rtt_1, d_1, r_1), (lat_2, lon_2, rtt_2, d_2, r_2)
82 | )
83 | if remove:
84 | circles_to_ignore.add(remove)
85 |
86 | circles_to_keep = set(circles_with_r_info) - circles_to_ignore
87 |
88 | return circles_to_keep
89 |
90 |
91 | def get_points_on_circle(lat_c, lon_c, r_c, nb_points: int = 4):
92 | """from a circle, return a set of points"""
93 | circle_points = []
94 | for k in range(nb_points):
95 | # compute
96 | angle = pi * 2 * k / nb_points
97 | dx = r_c * 1000 * cos(angle)
98 | dy = r_c * 1000 * sin(angle)
99 | lat = lat_c + (180 / pi) * (dy / 6378137)
100 | lon = lon_c + (180 / pi) * (dx / 6378137) / cos(lat_c * pi / 180)
101 |
102 | circle_points.append((lat, lon))
103 |
104 | return circle_points
105 |
106 |
107 | def circle_intersections(circles, speed_threshold=None):
108 | """
109 | Check out this link for more details about the maths:
110 | https://gis.stackexchange.com/questions/48937/calculating-intersection-of-two-circles
111 | """
112 | intersect_points = []
113 |
114 | circles = circle_preprocessing(circles, speed_threshold=speed_threshold)
115 |
116 | if len(circles) == 1:
117 | single_circle = list(circles)[0]
118 | lat, lon, rtt, d, r = single_circle
119 | filtered_points = get_points_on_circle(lat, lon, d)
120 | return filtered_points, circles
121 |
122 | for c_1, c_2 in itertools.combinations(circles, 2):
123 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1
124 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2
125 |
126 | x1 = np.array(list(geo_to_cartesian(lat_1, lon_1)))
127 | x2 = np.array(list(geo_to_cartesian(lat_2, lon_2)))
128 |
129 | q = np.dot(x1, x2)
130 |
131 | a = (np.cos(r_1) - np.cos(r_2) * q) / (1 - (q**2))
132 | b = (np.cos(r_2) - np.cos(r_1) * q) / (1 - (q**2))
133 |
134 | x0 = a * x1 + b * x2
135 |
136 | n = np.cross(x1, x2)
137 | if (1 - np.dot(x0, x0)) / np.dot(n, n) <= 0:
138 | # print("ANYCAST???", (lat_1, lon_1, rtt_1, d_1), (lat_2, lon_2, rtt_2, d_2))
139 | continue
140 |
141 | t = np.sqrt((1 - np.dot(x0, x0)) / np.dot(n, n))
142 |
143 | i1 = x0 + t * n
144 | i2 = x0 - t * n
145 |
146 | i_lon_1 = np.arctan2(i1[1], i1[0]) * (180 / np.pi)
147 | i_lat_1 = np.arctan(i1[2] / np.sqrt((i1[0] ** 2) + (i1[1] ** 2))) / (
148 | np.pi / 180
149 | )
150 | intersect_points.append((i_lat_1, i_lon_1))
151 |
152 | i_lon_2 = np.arctan2(i2[1], i2[0]) * (180 / np.pi)
153 | i_lat_2 = np.arctan(i2[2] / np.sqrt((i2[0] ** 2) + (i2[1] ** 2))) / (
154 | np.pi / 180
155 | )
156 | intersect_points.append((i_lat_2, i_lon_2))
157 |
158 | filtred_points = []
159 | for point_geo in intersect_points:
160 | for lat_c, long_c, rtt_c, d_c, r_c in circles:
161 | if not is_within_cirle((lat_c, long_c), rtt_c, point_geo, speed_threshold):
162 | break
163 | else:
164 | filtred_points.append(point_geo)
165 |
166 | return filtred_points, circles
167 |
168 |
169 | def polygon_centroid(points):
170 | """
171 | Compute polygon centroid using Finit Set of point method.
172 | (see https://en.wikipedia.org/wiki/Centroid#Of_a_finite_set_of_points)
173 | """
174 | x = 0
175 | y = 0
176 | for point in points:
177 | x += point[0]
178 | y += point[1]
179 | return x / len(points), y / len(points)
180 |
181 |
182 | def haversine(input_location, block_location):
183 | """Distance between two locations in earth."""
184 | in_lat, in_lon, block_lat, block_lon = map(
185 | np.radians, [*input_location, *block_location]
186 | )
187 |
188 | dlat = block_lat - in_lat
189 | dlon = block_lon - in_lon
190 |
191 | distances = (
192 | np.sin(dlat / 2.0) ** 2
193 | + np.cos(in_lat) * np.cos(block_lat) * np.sin(dlon / 2.0) ** 2
194 | )
195 |
196 | return 6367 * 2 * np.arcsin(np.sqrt(distances))
197 |
198 |
199 | def distance(lat1, lat2, lon1, lon2):
200 | lon1 = radians(lon1)
201 | lon2 = radians(lon2)
202 | lat1 = radians(lat1)
203 | lat2 = radians(lat2)
204 |
205 | # Haversine formula
206 | dlon = lon2 - lon1
207 | dlat = lat2 - lat1
208 | a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
209 |
210 | c = 2 * asin(sqrt(a))
211 |
212 | r = 6371
213 |
214 | return c * r
215 |
216 |
217 | def get_middle_intersection(intersections):
218 | """in case of only two intersection points, return the middle segment"""
219 | (lat1, lon1) = intersections[0]
220 | (lat2, lon2) = intersections[1]
221 |
222 | # convert to radians
223 | lon1 = radians(lon1)
224 | lon2 = radians(lon2)
225 | lat1 = radians(lat1)
226 | lat2 = radians(lat2)
227 |
228 | # calculate the middle of two points
229 | Bx = np.cos(lat2) * np.cos(lon2 - lon1)
230 | By = np.cos(lat2) * np.sin(lon2 - lon1)
231 | latMid = np.arctan2(
232 | np.sin(lat1) + np.sin(lat2),
233 | np.sqrt((np.cos(lat1) + Bx) * (np.cos(lat1) + Bx) + By * By),
234 | )
235 | lonMid = lon1 + np.arctan2(By, np.cos(lat1) + Bx)
236 |
237 | # convert back to degrees
238 | latMid = latMid * (180 / pi)
239 | lonMid = lonMid * (180 / pi)
240 |
241 | return latMid, lonMid
242 |
243 |
244 | def select_best_guess_centroid(target_ip, vp_coordinates_per_ip, rtt_per_vp_to_target):
245 | """
246 | Find the best guess
247 | that is the location of the vantage point closest to the centroid.
248 | """
249 | probe_circles = {}
250 | closest_vp = None
251 | min_rtt_per_vp_ip = {}
252 | for vp_ip, rtts in rtt_per_vp_to_target.items():
253 | if target_ip == vp_ip:
254 | continue
255 | if vp_ip not in vp_coordinates_per_ip:
256 | continue
257 | lat, lon = vp_coordinates_per_ip[vp_ip]
258 | min_rtt = min(rtts)
259 | if min_rtt > 100:
260 | continue
261 | min_rtt_per_vp_ip[vp_ip] = min_rtt
262 | # too inflated RTT means that measurement will not provide useful info
263 |
264 | if isinstance(min_rtt, float):
265 | probe_circles[vp_ip] = (
266 | lat,
267 | lon,
268 | min_rtt,
269 | None,
270 | None,
271 | )
272 | # print(f"vp_anchor = {vp_ip} with results: {min_rtt}")
273 | # print()
274 |
275 | # draw circles
276 | if not probe_circles:
277 | return None
278 | circles = list(probe_circles.values())
279 | intersections, circles = circle_intersections(circles, speed_threshold=2/3)
280 | if len(intersections) > 2:
281 | centroid = polygon_centroid(intersections)
282 | elif len(intersections) == 2:
283 | # only two circles intersection, centroid is middle of the segment
284 | centroid = get_middle_intersection(intersections)
285 | else:
286 | # only one circle so take the closest vp as the centroid
287 | closest_vp, _ = min(min_rtt_per_vp_ip.items(), key=lambda x: x[1])
288 | centroid = vp_coordinates_per_ip[closest_vp]
289 |
290 | return centroid, circles
291 |
292 |
293 | def get_center_of_poly(circles, speed):
294 | points, circles = circle_intersections(circles, speed)
295 | if len(points) == 0:
296 | return None, None
297 | return polygon_centroid(points)
298 |
299 |
300 | def get_points_in_poly(circles, rot, rad, speed, old_circles=[]):
301 | circles = circle_preprocessing(circles, speed_threshold=speed)
302 | points, circles = circle_intersections(circles, speed)
303 | if len(points) == 0:
304 | return []
305 | else:
306 | center = polygon_centroid(points)
307 | res = [center]
308 | iter_rad = 0
309 | points_added = True
310 | while points_added:
311 | iter_rad += rad
312 | points_added = False
313 | to_add_points = get_points_on_circle(
314 | center[0], center[1], iter_rad, int(360/rot))
315 | for point in to_add_points:
316 | all_in = True
317 | for vp in circles:
318 | if not is_within_cirle((vp[0], vp[1]), vp[2], point, speed):
319 | all_in = False
320 | break
321 | if all_in:
322 | for vp in old_circles:
323 | if not is_within_cirle((vp[0], vp[1]), vp[2], point, speed):
324 | all_in = False
325 | break
326 | if all_in:
327 | points_added = True
328 | res.append(point)
329 | return res
330 |
331 | def greedy_selection_probes_impl(probe, distance_per_probe, selected_probes):
332 |
333 | distances_log = [log(distance_per_probe[p]) for p in selected_probes
334 | if p in distance_per_probe and distance_per_probe[p] > 0]
335 | total_distance = sum(distances_log)
336 | return probe, total_distance
337 |
338 |
--------------------------------------------------------------------------------
/scripts/utils/measurement_utils.py:
--------------------------------------------------------------------------------
1 | """functions for running measurements"""
2 |
3 | import random
4 | import time
5 |
6 | from datetime import datetime
7 | from uuid import UUID
8 | from pathlib import Path
9 | from dateutil import parser
10 |
11 | from logger import logger
12 | from scripts.utils.file_utils import load_json, dump_json
13 | from scripts.ripe_atlas.atlas_api import get_prefix_from_ip, get_measurements_from_tag
14 | from scripts.ripe_atlas.ping_and_traceroute_classes import PING
15 | from scripts.utils.clickhouse import Clickhouse
16 |
17 | from default import (
18 | PREFIX_MEASUREMENT_RESULTS,
19 | TARGET_MEASUREMENT_RESULTS,
20 | USER_VPS_TO_PREFIX_TABLE,
21 | USER_VPS_TO_TARGET_TABLE,
22 | )
23 |
24 |
25 | def load_targets(target_file_path: Path, nb_target: int = -1) -> list:
26 | """get a file as entry, return a list of ip target"""
27 | targets = load_json(target_file_path)
28 |
29 | if nb_target > len(targets) or nb_target < 0:
30 | nb_target = len(targets)
31 |
32 | subset_targets = random.sample(targets, k=nb_target)
33 |
34 | return subset_targets
35 |
36 |
37 | def load_vps(vps_file_path: Path, nb_vps: int = -1) -> list:
38 | """load vps from file, return list of vps"""
39 | vps = load_json(vps_file_path)
40 |
41 | if nb_vps > len(vps) or nb_vps < 0:
42 | nb_vps = len(vps)
43 |
44 | subset_vps = random.sample(vps, k=nb_vps)
45 |
46 | return subset_vps
47 |
48 |
49 | def get_measurement_config(
50 | experiment_uuid: UUID,
51 | prefix_measurement_uuid: UUID,
52 | target_measurement_uuid: UUID,
53 | targets: list,
54 | target_prefixes: list,
55 | vps: dict,
56 | dry_run=False,
57 | ) -> dict:
58 | """return measurement config for future retrieval"""
59 | return {
60 | "experiment_uuid": str(experiment_uuid),
61 | "status": "ongoing",
62 | "start_time": str(datetime.now()),
63 | "end_time": None,
64 | "is_dry_run": dry_run,
65 | "nb_targets": len(targets),
66 | "nb_vps": len(vps),
67 | "description": "measurements from a set of vps towards all targets/target prefixes",
68 | "af": 4,
69 | "target_measurements": {
70 | "measurement_uuid": str(target_measurement_uuid),
71 | "targets": targets,
72 | "vps": vps,
73 | "end_time": None,
74 | },
75 | "prefix_measurements": {
76 | "measurement_uuid": str(prefix_measurement_uuid),
77 | "targets": target_prefixes,
78 | "vps": vps,
79 | "end_time": None,
80 | },
81 | }
82 |
83 |
84 | def save_measurement_config(measurement_config: dict, out_path: Path) -> None:
85 | """save measurement config"""
86 |
87 | try:
88 | if (
89 | measurement_config["prefix_measurements"]["end_time"] is not None
90 | and measurement_config["target_measurements"]["end_time"] is not None
91 | ):
92 | measurement_config["end_time"] = str(datetime.now())
93 | measurement_config["status"] = "finished"
94 | except KeyError:
95 | pass
96 |
97 | dump_json(measurement_config, out_path)
98 |
99 |
100 | def get_target_prefixes(targets: list) -> list:
101 | """from a set of targets ip addresses return their /24 prefixes"""
102 | return [get_prefix_from_ip(target_addr) for target_addr in targets]
103 |
104 |
105 | def ping_prefixes(
106 | measurement_uuid: UUID,
107 | measurement_config: dict,
108 | target_prefixes: list,
109 | targets_per_prefix: dict[list],
110 | vps: list[dict],
111 | dry_run: bool = False,
112 | use_cache: bool = True,
113 | cache_file: Path = PREFIX_MEASUREMENT_RESULTS,
114 | ) -> None:
115 | """ping all targets prefixes from all vps"""
116 |
117 | pinger = PING()
118 |
119 | try:
120 | # load cached prefix results in case measurement was interrupted
121 | if use_cache:
122 | cached_results = load_json(cache_file)
123 |
124 | if cached_results:
125 | logger.info(
126 | f"initial length targets: {len(targets_per_prefix)}, cached measurements : {len(cached_results)}"
127 | )
128 |
129 | # get prefixes out of targets
130 | cached_results = [
131 | get_prefix_from_ip(target["dst_addr"]) for target in cached_results
132 | ]
133 | for subnet in cached_results:
134 | if subnet not in targets_per_prefix:
135 | continue
136 | targets_per_prefix.pop(subnet)
137 |
138 | logger.info(
139 | f"after removing cached: {len(targets_per_prefix)}, cached measurements : {len(cached_results)}"
140 | )
141 | except FileNotFoundError:
142 | logger.info("No cached results available")
143 | pass
144 |
145 | logger.info(
146 | f"Starting measurements {str(measurement_uuid)} with parameters: dry_run={dry_run}; nb_targets={len(target_prefixes)}; nb_vps={len(vps)}."
147 | )
148 |
149 | # measurement for 3 targets in every target prefixes
150 | ids, start_time, end_time = pinger.ping_by_prefix(
151 | target_prefixes=target_prefixes,
152 | vps=vps,
153 | targets_per_prefix=targets_per_prefix,
154 | tag=measurement_uuid,
155 | dry_run=dry_run,
156 | )
157 |
158 | # overwrite ids
159 | if "ids" in measurement_config["prefix_measurements"]:
160 | ids.extend(measurement_config["prefix_measurements"]["ids"])
161 |
162 | measurement_config["prefix_measurements"]["start_time"] = start_time
163 | measurement_config["prefix_measurements"]["end_time"] = end_time
164 |
165 |
166 | def ping_targets(
167 | measurement_uuid: UUID,
168 | measurement_config: dict,
169 | targets: list[dict],
170 | vps: list[dict],
171 | dry_run: bool = False,
172 | use_cache: bool = True,
173 | cache_file: Path = TARGET_MEASUREMENT_RESULTS,
174 | ) -> None:
175 | """ping all targets using all vps"""
176 |
177 | pinger = PING()
178 |
179 | targets = [t["address_v4"] for t in targets]
180 |
181 | try:
182 | if use_cache:
183 | cached_results = load_json(cache_file)
184 | logger.info(
185 | f"initial length targets: {len(targets)}, cached measurements : {len(cached_results)}"
186 | )
187 |
188 | cached_results = [c["dst_addr"] for c in cached_results]
189 |
190 | targets = list(set(targets).difference(set(cached_results)))
191 |
192 | logger.info(
193 | f"after removing cached: {len(targets)}, cached measurements : {len(cached_results)}"
194 | )
195 | except FileNotFoundError:
196 | logger.info("No cached results available")
197 | pass
198 |
199 | logger.info(
200 | f"Starting measurements {str(measurement_uuid)} with parameters: dry_run={dry_run}; nb_targets={len(targets)}; nb_vps={len(vps)}."
201 | )
202 |
203 | ids, start_time, end_time = pinger.ping_by_target(
204 | targets=targets, vps=vps, tag=measurement_uuid, dry_run=dry_run
205 | )
206 |
207 | # overwrite ids
208 | if "ids" in measurement_config["target_measurements"]:
209 | ids.extend(measurement_config["target_measurements"]["ids"])
210 |
211 | measurement_config["target_measurements"]["start_time"] = start_time
212 | measurement_config["target_measurements"]["end_time"] = end_time
213 |
214 |
215 | def get_latest_measurements(config_path: Path) -> dict:
216 | """retrieve latest measurement config"""
217 | try:
218 | assert config_path.is_dir()
219 | except AssertionError:
220 | logger.error(f"config path is not a dir: {config_path}")
221 |
222 | latest: datetime = None
223 | for file in config_path.iterdir():
224 | measurement_config = load_json(file)
225 | if latest:
226 | if latest < parser.isoparse(measurement_config["start_time"]):
227 | latest_config = measurement_config
228 | else:
229 | latest = parser.isoparse(measurement_config["start_time"])
230 | latest_config = measurement_config
231 |
232 | return latest_config
233 |
234 |
235 | def retrieve_results(
236 | measurement_uuid: str,
237 | out_file: Path,
238 | ) -> None:
239 | """query RIPE Atlas API to retrieve all measurement results"""
240 | # fetch results on API
241 | measurement_results = get_measurements_from_tag(measurement_uuid)
242 |
243 | logger.info(
244 | f"nb measurements retrieved: {len(measurement_results)} for measurement_uuid : {measurement_uuid}"
245 | )
246 |
247 | # save results in cache file
248 | dump_json(measurement_results, out_file)
249 |
250 | return measurement_results
251 |
252 |
253 | def insert_prefix_results(results: list) -> None:
254 | """insert prefixes results with CSV value method"""
255 | rows = []
256 | values_description = (
257 | "src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto"
258 | )
259 |
260 | if not results:
261 | raise RuntimeError(f"no data to insert, data = {result}")
262 |
263 | for result in results:
264 | try:
265 | # parse response
266 | src = result["src_addr"]
267 | dst = result["dst_addr"]
268 | prb_id = result["prb_id"]
269 | date = result["timestamp"]
270 | sent = result["sent"]
271 | rcvd = result["rcvd"]
272 | rtts = (
273 | [rtt["rtt"] for rtt in result["result"]]
274 | if "rtt" in result["result"]
275 | else [-1]
276 | )
277 | min = result["min"]
278 | mean = result["avg"]
279 | msm_id = result["msm_id"]
280 | proto = 0
281 |
282 | row = [src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto]
283 |
284 | rows.append(row)
285 | except KeyError as e:
286 | logger.warning(f"Some measurements does not contain results: {e}")
287 |
288 | clickhouse = Clickhouse()
289 | query = clickhouse.insert_from_values_query(
290 | USER_VPS_TO_PREFIX_TABLE, values_description
291 | )
292 | clickhouse.insert_from_values(query, rows)
293 |
294 | logger.info(
295 | f"Prefix measurements successfully inserted in table : {USER_VPS_TO_PREFIX_TABLE}"
296 | )
297 |
298 |
299 | def insert_target_results(results: list) -> None:
300 | """insert prefixes results with CSV value method"""
301 | rows = []
302 | values_description = (
303 | "src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto"
304 | )
305 | for result in results:
306 | # parse response
307 | src = result["src_addr"]
308 | dst = result["dst_addr"]
309 | prb_id = result["prb_id"]
310 | date = result["timestamp"]
311 | sent = result["sent"]
312 | rcvd = result["rcvd"]
313 | rtts = (
314 | [rtt["rtt"] for rtt in result["result"]]
315 | if "rtt" in result["result"]
316 | else [-1]
317 | )
318 | min = result["min"]
319 | mean = result["avg"]
320 | msm_id = result["msm_id"]
321 | proto = 0
322 |
323 | row = [src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto]
324 |
325 | rows.append(row)
326 |
327 | clickhouse = Clickhouse()
328 | query = clickhouse.insert_from_values_query(
329 | USER_VPS_TO_TARGET_TABLE, values_description
330 | )
331 | clickhouse.insert_from_values(query, rows)
332 |
333 | logger.info(
334 | f"Target measurements successfully inserted in table : {USER_VPS_TO_TARGET_TABLE}"
335 | )
336 |
--------------------------------------------------------------------------------
/scripts/utils/plot_utils.py:
--------------------------------------------------------------------------------
1 | """Functions to plot figures in a nice way"""
2 |
3 | from matplotlib.patches import Polygon
4 | from matplotlib.lines import Line2D
5 | from pathlib import Path
6 | import matplotlib.pyplot as plt
7 | import matplotlib
8 |
9 | matplotlib.use("Agg")
10 |
11 | font = {"weight": "bold", "size": 16} # 'family' : 'normal',
12 | matplotlib.rcParams["pdf.fonttype"] = 42
13 | matplotlib.rcParams["ps.fonttype"] = 42
14 | fontsize_axis = 17
15 | font_size_alone = 14
16 | matplotlib.rc("font", **font)
17 |
18 | markers = ["o", "s", "v", "^"]
19 | linestyles = ["-", "--", "-.", ":"]
20 |
21 | colors_blind = [
22 | ["blue", (0, 114.0 / 255, 178.0 / 255)],
23 | ["orange", (230.0 / 255, 159.0 / 255, 0)],
24 | ["reddish_purple", (204.0 / 255, 121.0 / 255, 167.0 / 255)],
25 | ["black", (0, 0, 0)],
26 | ["bluish_green", (0, 158.0 / 255, 115.0 / 255)],
27 | ["sky_blue", (86.0 / 255, 180.0 / 255, 233.0 / 255)],
28 | ["vermillon", (213.0 / 255, 94.0 / 255, 0)],
29 | # ["yellow", (240.0 / 255, 228.0 / 255, 66.0 / 255)],
30 | ]
31 |
32 |
33 | def plot_multiple_cdf(
34 | Ys,
35 | n_bins,
36 | xmin,
37 | xmax,
38 | xlabel,
39 | ylabel,
40 | legend,
41 | ymin=0,
42 | ymax=1.05,
43 | xticks=None,
44 | xticks_labels=None,
45 | xscale="linear",
46 | yscale="linear",
47 | cumulative=True,
48 | figure=None,
49 | axes=None,
50 | offset=0,
51 | colors_arg=None,
52 | linestyles_arg=None,
53 | ):
54 | if figure is not None and axes is not None:
55 | fig = figure
56 | ax = axes
57 | else:
58 | subplots = plt.subplots()
59 | fig, ax = subplots
60 | ax.set_xlabel(xlabel, fontsize=fontsize_axis)
61 | ax.set_ylabel(ylabel, fontsize=fontsize_axis)
62 | # title = title + " CDF"
63 | # plt.title("CDF", fontsize=fontsize_axis)
64 |
65 | ax.grid(linestyle="dotted")
66 | if len(Ys) == 1:
67 | i = 0
68 | Y = Ys[i]
69 | if colors_arg is not None:
70 | color = colors_arg[i][1]
71 | else:
72 | color = colors_blind[(i + offset) % len(colors_blind)][1]
73 |
74 | if linestyles_arg is not None:
75 | linestyle = linestyles[i]
76 | else:
77 | linestyle = linestyles[(i + offset) % len(linestyles)]
78 |
79 | n, bins, patches = ax.hist(
80 | Y,
81 | density=True,
82 | histtype="step",
83 | bins=n_bins,
84 | cumulative=cumulative,
85 | linewidth=1.35,
86 | color=color,
87 | linestyle=linestyle,
88 | )
89 | patches[0].set_xy(patches[0].get_xy()[1:-1])
90 | else:
91 | for i in range(0, len(Ys)):
92 | Y = Ys[i]
93 | if colors_arg is not None:
94 | color = colors_arg[i][1]
95 | else:
96 | color = colors_blind[(i + offset) % len(colors_blind)][1]
97 |
98 | if linestyles_arg is not None:
99 | linestyle = linestyles_arg[i]
100 | else:
101 | linestyle = linestyles[(i + offset) % len(linestyles)]
102 |
103 | n, bins, patches = ax.hist(
104 | Y,
105 | density=True,
106 | histtype="step",
107 | bins=n_bins,
108 | cumulative=cumulative,
109 | linewidth=1.35,
110 | label=legend[i],
111 | color=color,
112 | linestyle=linestyle,
113 | )
114 | patches[0].set_xy(patches[0].get_xy()[1:-1])
115 |
116 | # plt.xscale("symlog")
117 | # xticks = ax.xaxis.get_major_ticks()
118 | # xticks[1].label1.set_visible(False)
119 | # # xticks[2].label1.set_visible(False)
120 | # xticks[-2].label1.set_visible(False)
121 | ax.set_xscale(xscale)
122 | ax.set_yscale(yscale)
123 | ax.set_xlim(left=xmin, right=xmax)
124 | ax.set_ylim(bottom=ymin, top=ymax)
125 | if xticks is not None:
126 | ax.set_xticks(xticks)
127 | # xtickNames = plt.setp(ax, xticklabels=[f"{r}" for r in x_ticks])
128 | if xticks_labels is not None:
129 | ax.set_xticklabels(xticks_labels)
130 |
131 | # Normalize the data to a proper PDF
132 | # plt.tight_layout()
133 | # plt.savefig(r"resources/figures/" + ofile + ".pdf")
134 | return fig, ax
135 |
136 |
137 | def plot_multiple_error_bars(
138 | X, Ys, Yerrs, xmin, xmax, ymin, ymax, xlabel, ylabel, xscale, yscale, labels
139 | ):
140 | fig, ax = plt.subplots()
141 | ax.set_xlabel(xlabel, fontsize=fontsize_axis)
142 |
143 | ax.set_ylabel(ylabel, fontsize=fontsize_axis)
144 | ax.grid(linestyle="dotted")
145 |
146 | # x_ticks = [inf_born+1]
147 | for i in range(len(Ys)):
148 | Y = Ys[i]
149 | Yerr = Yerrs[i]
150 | lns1 = ax.errorbar(
151 | X,
152 | Y,
153 | Yerr,
154 | label=labels[i],
155 | linewidth=0.5,
156 | marker=markers[i % len(markers)],
157 | markersize=1,
158 | markeredgewidth=1,
159 | capsize=2,
160 | )
161 | ax.set_xscale(xscale)
162 | ax.set_yscale(yscale)
163 | ax.set_xlim(left=xmin, right=xmax)
164 | ax.set_ylim(bottom=ymin, top=ymax)
165 | return fig, ax
166 |
167 |
168 | def plot_save(ofile: Path, is_tight_layout):
169 | # check that dirs exits
170 | if not ofile.parent.exists():
171 | ofile.parent.mkdir(parents=True, exist_ok=True)
172 |
173 | if is_tight_layout:
174 | plt.tight_layout()
175 | # plt.show()
176 | plt.savefig(ofile)
177 |
178 | # plt.clf()
179 |
180 |
181 | def homogenize_legend(ax, legend_location, legend_size=14):
182 | handles, labels = ax.get_legend_handles_labels()
183 | new_handles = []
184 | for h in handles:
185 | if isinstance(h, Line2D):
186 | new_handles.append(h)
187 | elif isinstance(h, Polygon):
188 | new_handles.append(
189 | Line2D([], [], linestyle=h.get_linestyle(), color=h.get_edgecolor())
190 | )
191 | ax.legend(
192 | loc=legend_location,
193 | prop={"size": legend_size},
194 | handles=new_handles,
195 | labels=labels,
196 | )
197 |
198 |
199 | def plot_scatter_multiple(
200 | Xs,
201 | Ys,
202 | xmin,
203 | xmax,
204 | ymin,
205 | ymax,
206 | xscale,
207 | yscale,
208 | xlabel,
209 | ylabel,
210 | markers,
211 | marker_colors,
212 | marker_size,
213 | ):
214 | fig, ax = plt.subplots()
215 |
216 | # ax.set_xlabel(title, fontsize=fontsize_axis)
217 | # plt.title("CDF", fontsize=fontsize_axis)
218 |
219 | # x_ticks = [inf_born]
220 | # x_ticks.extend(np.arange(inf_born, sup_born, xtick_interval))
221 | # ax.set_xticks(x_ticks)
222 | # xtickNames = plt.setp(ax, xticklabels=["{0:.1f}".format(r) for r in x_ticks])
223 | # ax.set_xticklabels(xtickNames, rotation=45)
224 | # ax.set_xticklabels(xtickNames)
225 |
226 | ax.grid(linestyle="dotted")
227 | ax.set_xlabel(xlabel, fontsize=fontsize_axis)
228 | ax.set_ylabel(ylabel, fontsize=fontsize_axis)
229 |
230 | for i in range(0, len(Xs)):
231 | X = Xs[i]
232 | Y = Ys[i]
233 |
234 | # , markersize=10, markeredgewidth=2)
235 | ax.scatter(X, Y, c=marker_colors[i], marker=markers[i], s=marker_size[i])
236 | # ax.plot(X, Y)
237 | # patches[0].set_xy(patches[0].get_xy()[:-1])
238 | ax.set_xscale(xscale)
239 | ax.set_yscale(yscale)
240 |
241 | ax.set_xlim(left=xmin, right=xmax)
242 | ax.set_ylim(bottom=ymin, top=ymax)
243 |
244 | return fig, ax
245 |
--------------------------------------------------------------------------------