├── .env.example ├── .gitignore ├── README.md ├── analysis ├── million_scale.ipynb ├── million_scale.py ├── plot.ipynb ├── ripe_atlas_probes_bias.ipynb └── tables.ipynb ├── clickhouse_files ├── init-db.sh └── users.d │ └── default.xml ├── datasets └── create_datasets.ipynb ├── default.py ├── install.sh ├── logger.py ├── measurements ├── landmark_traceroutes.ipynb ├── million_scale_measurements.ipynb └── million_scale_measurements.py ├── poetry.lock ├── pyproject.toml └── scripts ├── analysis └── analysis.py ├── ripe_atlas ├── atlas_api.py └── ping_and_traceroute_classes.py ├── street_level ├── landmark.py ├── three_tiers.py └── traceroutes_results.py └── utils ├── clickhouse.py ├── clickhouse_installer.py ├── credentials.py ├── file_utils.py ├── helpers.py ├── measurement_utils.py └── plot_utils.py /.env.example: -------------------------------------------------------------------------------- 1 | RIPE_USERNAME= 2 | RIPE_SECRET_KEY= 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output files 2 | *.fsdb 3 | *.pdf 4 | *.csv 5 | *.dat 6 | *.tif 7 | *.tree 8 | *.zst 9 | *.json 10 | *.dat 11 | *.txt 12 | 13 | measurements/results 14 | clickhouse_files/data/ 15 | clickhouse_files/logs/ 16 | clickhouse_files/clickhouse 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 177 | #.idea/ 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🗺️ Replication: Towards a Publicly Available Internet scale IP Geolocation Dataset (IMC 2023) 2 | This repository contains the code needed to reproduce and replicate our results in our [IMC 2023 paper](). 3 | 4 | Our study replicates the methodology of two papers that obtained outstanding results on geolocating IP addresses in terms of coverage and accuracy in nowadays Internet on the largest publicly available measurement platform, RIPE Atlas. 5 | These two papers are: 6 | 7 | 1. [Towards geolocation of millions of IP addresses (IMC 2012)](https://dl.acm.org/doi/abs/10.1145/2398776.2398790) 8 | 9 | 2. [Towards Street-Level Client-Independent IP Geolocation (NSDI 2011)](https://www.usenix.org/legacy/event/nsdi11/tech/full_papers/Wang_Yong.pdf). 10 | 11 | They are called million scale and street level papers throughout this README, as done in our paper. 12 | 13 | Our code offers the possibility to: 14 | 1. reproduce our results using our measurement datasets. 15 | 2. replicate our methodology with different targets and vantage points. For now, only RIPE Atlas vantage points are supported, but it should not be difficult to adapt the code to handle other vantage points and targets. 16 | 17 | ## Prerequisites 18 | Our code performs measurements on RIPE Atlas, so be sure to have an account if you want to replicate our methodology with your own RIPE Atlas measurements. 19 | 20 | ⚠️ **To replicate our RIPE Atlas measurements, you will need a lot of credits (millions)**. 
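If you plan to launch your own measurements, it can help to check your credit balance before starting. The snippet below is a minimal sketch, not part of this repository, that queries the RIPE Atlas v2 API using the `RIPE_SECRET_KEY` from [.env.example](.env.example); the exact response fields may differ from what is assumed here.

```python
# Hypothetical helper (not part of this repository): check the RIPE Atlas
# credit balance of the account associated with RIPE_SECRET_KEY.
import os

import requests  # assumed to be available in your environment

resp = requests.get(
    "https://atlas.ripe.net/api/v2/credits/",
    params={"key": os.environ.get("RIPE_SECRET_KEY", "")},
    timeout=10,
)
resp.raise_for_status()
# "current_balance" is the expected field name; adjust if the API response differs.
print("Current credit balance:", resp.json().get("current_balance"))
```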
21 |
22 |
23 | ## Table of contents
24 |
25 | - [Installation](#installation)
26 | - [Requirements](#requirements)
27 | - [Download datasets](#download-datasets)
28 | - [Clone the repository](#clone-the-repository)
29 | - [Installer](#installer)
30 | - [Install source files](#install-source-files)
31 | - [Clickhouse](#clickhouse)
32 | - [Settings](#settings)
33 | - [Further notice](#further-notice)
34 | - [Reproduction](#reproduction)
35 | - [Run your own measurements](#run-your-own-measurements)
36 |
37 | ## [Installation](#installation)
38 |
39 | ### [Requirements](#requirements)
40 |
41 | - [Python 3.9](https://www.python.org/downloads/) (or above)
42 | - [Poetry](https://python-poetry.org/docs/)
43 | - [Docker](https://docs.docker.com/engine/install/)
44 |
45 |
46 | ### [Download datasets](#download-datasets)
47 |
48 | You can fetch our data from our FTP server ftp.iris.dioptra.io, which provides the ClickHouse tables dumped in CSV format.
49 |
50 | ### [Clone the repository](#clone-the-repository)
51 |
52 | ```bash
53 | git clone https://github.com/dioptra-io/geoloc-imc-2023.git
54 | cd geoloc-imc-2023
55 | ```
56 |
57 | ### [Installer](#installer)
58 |
59 | You can use the script **install.sh** to:
60 | - Pull the clickhouse docker image.
61 | - Start the clickhouse server.
62 | - Download the clickhouse-client binary.
63 | - Install the Python project using Poetry.
64 | - Create all tables and populate the database with our measurements.
65 |
66 | ```bash
67 | source install.sh
68 | ```
69 | If the installation fails, all necessary steps to use the project are described below.
70 |
71 | ### [Install source files](#install-source-files)
72 |
73 | GeoScale uses Poetry as its dependency manager; install the project with:
74 | ```bash
75 | poetry shell
76 | poetry lock
77 | poetry install
78 | ```
79 |
80 | ### [Clickhouse](#clickhouse)
81 |
82 | We use Docker to run the ClickHouse server; by default the server listens on localhost on port 8123 (HTTP) and port 9000 (native TCP). If you prefer using your own Docker configuration, please also modify [default.py](default.py).
83 | ```bash
84 |
85 | # pull the docker image
86 | docker pull clickhouse/clickhouse-server:22.6
87 |
88 | # start the server
89 | docker run --rm -d \
90 | -v ./clickhouse_files/data:/var/lib/clickhouse/ \
91 | -v ./clickhouse_files/logs:/var/log/clickhouse-server/ \
92 | -v ./clickhouse_files/users.d:/etc/clickhouse-server/users.d:ro \
93 | -v ./clickhouse_files/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \
94 | -p 8123:8123 \
95 | -p 9000:9000 \
96 | --ulimit nofile=262144:262144 \
97 | clickhouse/clickhouse-server:22.6
98 | ```
99 |
100 | You can either install [clickhouse-client](https://clickhouse.com/docs/en/install) or download the clickhouse client binary (by default, [install.sh](install.sh) downloads the binary file).
101 | ```bash
102 | curl https://clickhouse.com/ | sh
103 | mv clickhouse ./clickhouse_files/
104 | ```
105 |
106 | Finally, create all necessary tables and populate them with our own measurements:
107 | ```bash
108 | python scripts/utils/clickhouse_installer.py
109 | ```
110 |
111 |
112 | ### [Settings](#settings)
113 |
114 | Our tool relies on ENV variables for configuring ClickHouse and interacting with the RIPE Atlas API.
115 | An example of the necessary ENV variables is given in [.env.example](.env.example).
Create your own
116 | env file with the following values:
117 | ```.env
118 | RIPE_USERNAME=
119 | RIPE_SECRET_KEY=
120 | ```
121 |
122 | ⚠️ **If** you use your own ClickHouse configuration, you can also modify the following ENV variables:
123 | ```
124 | # clickhouse settings
125 | CLICKHOUSE_CLIENT=
126 | CLICKHOUSE_HOST=
127 | CLICKHOUSE_DB=
128 | CLICKHOUSE_USER=
129 | CLICKHOUSE_PASSWORD=
130 | ```
131 | ### [Further notice](#further-notice)
132 |
133 | #### Test environment
134 |
135 | The project has been run on:
136 | - CentOS 7.5
137 | - Python 3.9
138 | - a server with 64 GB of RAM and 32 cores.
139 |
140 | ⚠️ Some scripts and analyses can use a lot of CPU and RAM (tens of GB) and last for hours.
141 |
142 |
143 | ## [Reproducing our results](#reproduction)
144 |
145 | We provide Python scripts and Jupyter notebooks to reproduce the results and graphs that we obtained when replicating the million scale and street level papers.
146 |
147 | ### Million Scale
148 |
149 | You can reproduce the million scale results using a Jupyter notebook: [million_scale.ipynb](./analysis/million_scale.ipynb)
150 |
151 | Alternatively, you can run the Python script in the background, as some steps take very long to execute (several hours):
152 | ```bash
153 | nohup python analysis/million_scale.py > output.log &
154 | ```
155 |
156 | All analysis results can be found in **./analysis/results**.
157 |
158 | ### Street level
159 |
160 | ⚠️ Tier 1 of the street-level replication (see the paper for more details) relies on results computed by the million scale technique. You need to run the million scale notebook/script **before** running those of street-level.
161 |
162 | No additional steps are necessary to reproduce the street-level experiment.
163 |
164 | ### Generating figures
165 |
166 | You can directly use the notebooks [plot.ipynb](./analysis/plot.ipynb) and [tables.ipynb](./analysis/tables.ipynb) to produce the figures and tables of our paper.
167 |
168 | ## [Run your own measurements](#run-your-own-measurements)
169 |
170 | You can also run your own measurements on custom datasets of targets (anchors) and vantage points (probes).
171 |
172 | ### First step: generate targets and vantage points datasets
173 |
174 | The Jupyter notebook [create_dataset](./datasets/create_datasets.ipynb) will:
175 | - generate the set of probes (used as vantage points)
176 | - generate the set of anchors (used as targets)
177 | - filter both sets by removing problematic probes (wrongly geolocated, for example)
178 |
179 | All generated files will be placed in /datasets/user_datasets.
180 |
181 | ### Second step: run measurements
182 |
183 | With [million_scale_measurements.ipynb](./measurements/million_scale_measurements.ipynb), you can select a subset of vantage points and targets and run measurements on RIPE Atlas.
184 |
185 | This notebook will start measurements:
186 | 1. towards all targets from all vantage points
187 | 2. towards 3 responsive addresses for each target from all vantage points
188 |
189 | ⚠️ These measurements might cost a lot of RIPE Atlas credits and time if you run them on large datasets (the default is only 2 targets and 4 vantage points).
190 |
191 | ### Third step: analyze your results
192 |
193 | Perform the analysis using the same steps described previously on your own measurement results and datasets by setting the boolean variable ```run_repro = True``` at the beginning of [million_scale.ipynb](./analysis/million_scale.ipynb) (or [million_scale.py](./analysis/million_scale.py) if you are using the script).
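For intuition about what this analysis computes: both the million scale and street level techniques rely on the classic constraint that a measured round-trip time upper-bounds the distance between a vantage point and a target. The sketch below is illustrative only; it mirrors the `SPEED_OF_LIGHT` / `SPEED_OF_INTERNET` constants defined in [default.py](default.py) and reimplements a haversine helper locally instead of importing the repository's `scripts/utils/helpers.py`.

```python
# Standalone sketch of the RTT-to-distance constraint used in CBG-style geolocation.
# SPEED_OF_LIGHT / SPEED_OF_INTERNET mirror the constants in default.py;
# the haversine helper below is a local reimplementation for illustration only.
from math import radians, sin, cos, asin, sqrt

SPEED_OF_LIGHT = 300000                      # km/s
SPEED_OF_INTERNET = SPEED_OF_LIGHT * 2 / 3   # ~2/3 c, typical propagation speed in fiber


def haversine(coord1, coord2):
    """Great-circle distance in km between two (lat, lon) points."""
    lat1, lon1 = map(radians, coord1)
    lat2, lon2 = map(radians, coord2)
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))


def max_distance_km(min_rtt_ms: float) -> float:
    """Upper bound on the VP-target distance implied by a minimum RTT."""
    one_way_s = (min_rtt_ms / 1000) / 2
    return one_way_s * SPEED_OF_INTERNET


# A vantage point in Paris measuring a 10 ms minimum RTT cannot be farther than
# ~1000 km from the target, so a candidate location in New York is ruled out.
vp = (48.85, 2.35)           # Paris
candidate = (40.71, -74.0)   # New York
print(max_distance_km(10.0))                               # 1000.0 km
print(haversine(vp, candidate) <= max_distance_km(10.0))   # False
```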
194 | 195 | 196 | 197 | TODO: Street level 198 | 199 | ## 📚 Publications 200 | 201 | ```bibtex 202 | @inproceedings{darwich2023replication, 203 | title={Replication: Towards a Publicly Available Internet scale IP Geolocation Dataset}, 204 | author={Darwich, Omar and Rimlinger, Hugo and Dreyfus, Milo and Gouel, Matthieu and Vermeulen, Kevin}, 205 | booktitle={Proceedings of the 2023 ACM on Internet Measurement Conference}, 206 | pages={1--15}, 207 | year={2023} 208 | } 209 | ``` 210 | 211 | 212 | ## 🧑‍💻 Authors 213 | 214 | This project is the result of a collaboration between the [LAAS-CNRS](https://www.laas.fr/public/) and [Sorbonne Université](https://www.sorbonne-universite.fr/). 215 | 216 | -------------------------------------------------------------------------------- /analysis/million_scale.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# First step of the analysis\n", 8 | "\n", 9 | "Preprocess results and save them before they can be plotted. \n", 10 | "\n", 11 | "To do after measurements notebooks" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from scripts.utils.file_utils import load_json, dump_json\n", 21 | "\n", 22 | "from scripts.analysis.analysis import *\n", 23 | "from default import *\n", 24 | "\n", 25 | "# set to True to use your own datasets/measurements\n", 26 | "run_repro = False\n", 27 | "if run_repro:\n", 28 | " # DATASET FILES\n", 29 | " PROBES_FILE = REPRO_PROBES_FILE\n", 30 | " PROBES_AND_ANCHORS_FILE = REPRO_PROBES_AND_ANCHORS_FILE\n", 31 | " FILTERED_PROBES_FILE = REPRO_FILTERED_PROBES_FILE\n", 32 | " GREEDY_PROBES_FILE = REPRO_GREEDY_PROBES_FILE\n", 33 | " PAIRWISE_DISTANCE_FILE = REPRO_PAIRWISE_DISTANCE_FILE\n", 34 | " VPS_TO_TARGET_TABLE = PROBES_TO_ANCHORS_PING_TABLE\n", 35 | " VPS_TO_PREFIX_TABLE = PROBES_TO_PREFIX_TABLE\n", 36 | "\n", 37 | " # RESULT FILES\n", 38 | " PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE\n", 39 | " ROUND_BASED_ALGORITHM_FILE = REPRO_ROUND_BASED_ALGORITHM_FILE\n", 40 | " ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE\n", 41 | " VP_SELECTION_ALGORITHM_PROBES_1_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE\n", 42 | " VP_SELECTION_ALGORITHM_PROBES_3_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE\n", 43 | " VP_SELECTION_ALGORITHM_PROBES_10_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE\n", 44 | " \n", 45 | "else:\n", 46 | " # DATASET FILES\n", 47 | " PROBES_FILE = USER_PROBES_FILE\n", 48 | " PROBES_AND_ANCHORS_FILE = USER_PROBES_AND_ANCHORS_FILE\n", 49 | " FILTERED_PROBES_FILE = USER_FILTERED_PROBES_FILE\n", 50 | " GREEDY_PROBES_FILE = USER_GREEDY_PROBES_FILE\n", 51 | " PAIRWISE_DISTANCE_FILE = USER_PAIRWISE_DISTANCE_FILE\n", 52 | " VPS_TO_TARGET_TABLE = USER_VPS_TO_TARGET_TABLE\n", 53 | " VPS_TO_PREFIX_TABLE = USER_VPS_TO_PREFIX_TABLE\n", 54 | "\n", 55 | " # RESULT FILES\n", 56 | " PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE\n", 57 | " ROUND_BASED_ALGORITHM_FILE = USER_ROUND_BASED_ALGORITHM_FILE\n", 58 | " ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE\n", 59 | " VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE\n", 60 | " VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE\n", 61 | " VP_SELECTION_ALGORITHM_PROBES_10_FILE = 
USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE\n", 62 | "\n", 63 | "LIMIT = 1000" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "filtered_probes = load_json(FILTERED_PROBES_FILE)\n", 73 | "\n", 74 | "filter = \"\"\n", 75 | "if len(filtered_probes) > 0:\n", 76 | " # Remove probes that are wrongly geolocated\n", 77 | " in_clause = f\"\".join(\n", 78 | " [f\",toIPv4('{p}')\" for p in filtered_probes])[1:]\n", 79 | " filter += f\"AND dst not in ({in_clause}) AND src not in ({in_clause}) \"\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Compute errors\n", 87 | "\n", 88 | "Compute the median error between the guessed geolocations and the real geolocations" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 98 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, vp_distance_matrix, probes_per_ip = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=70)\n", 108 | "\n", 109 | "vps_per_target = {dst: set(vp_coordinates_per_ip.keys())\n", 110 | " for dst in rtt_per_srcs_dst}\n", 111 | "features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip, THRESHOLD_DISTANCES,\n", 112 | " vps_per_target=vps_per_target,\n", 113 | " distance_operator=\">\", max_vps=100000,\n", 114 | " is_use_prefix=False,\n", 115 | " vp_distance_matrix=vp_distance_matrix,\n", 116 | " )\n", 117 | "\n", 118 | "dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Round Algorithm\n", 126 | "\n", 127 | "First is to use a subset of greedy probes, and then take 1 probe/AS in the given CBG area to compute the median error." 
128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 137 | "\n", 138 | "asn_per_vp_ip = {}\n", 139 | "vp_coordinates_per_ip = {}\n", 140 | "\n", 141 | "for probe in all_probes:\n", 142 | " if \"address_v4\" in probe and \"geometry\" in probe and \"coordinates\" in probe[\"geometry\"]:\n", 143 | " ip_v4_address = probe[\"address_v4\"]\n", 144 | " if ip_v4_address is None:\n", 145 | " continue\n", 146 | " long, lat = probe[\"geometry\"][\"coordinates\"]\n", 147 | " asn_v4 = probe[\"asn_v4\"]\n", 148 | " asn_per_vp_ip[ip_v4_address] = asn_v4\n", 149 | " vp_coordinates_per_ip[ip_v4_address] = lat, long\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# clickhouse is required here\n", 159 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=100)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 8, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 9, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Using 10 tier1_vps\n", 181 | "Using 100 tier1_vps\n", 182 | "Using 300 tier1_vps\n", 183 | "Using 500 tier1_vps\n", 184 | "Using 1000 tier1_vps\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "TIER1_VPS = [10, 100, 300, 500, 1000]\n", 190 | "greedy_probes = load_json(GREEDY_PROBES_FILE)\n", 191 | "error_cdf_per_tier1_vps = {}\n", 192 | "for tier1_vps in TIER1_VPS:\n", 193 | " print(f\"Using {tier1_vps} tier1_vps\")\n", 194 | " error_cdf = round_based_algorithm(greedy_probes, rtt_per_srcs_dst, vp_coordinates_per_ip,\n", 195 | " asn_per_vp_ip,\n", 196 | " tier1_vps,\n", 197 | " threshold=40)\n", 198 | " error_cdf_per_tier1_vps[tier1_vps] = error_cdf\n", 199 | " \n", 200 | "dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Accuracy vs number of vps probes\n", 208 | "WARNING : Time consumming section \n", 209 | "\n", 210 | "Compute median error for each target, depending on the number of initial VPs." 
211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 10, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 220 | "\n", 221 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, \\\n", 222 | " vp_distance_matrix, probe_per_ip = compute_geo_info(\n", 223 | " all_probes, serialized_file=PAIRWISE_DISTANCE_FILE)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 12, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stderr", 233 | "output_type": "stream", 234 | "text": [ 235 | "2023-09-13 16:22:03::INFO:root:analysis:: Starting computing for random VPs 100\n", 236 | "2023-09-13 16:23:13::INFO:root:analysis:: Starting computing for random VPs 200\n", 237 | "2023-09-13 16:24:21::INFO:root:analysis:: Starting computing for random VPs 300\n", 238 | "2023-09-13 16:25:31::INFO:root:analysis:: Starting computing for random VPs 400\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "subset_sizes = []\n", 244 | "subset_sizes.extend([i for i in range(100, 500, 100)])\n", 245 | "# subset_sizes.extend([i for i in range(1000, 10001, 1000)])\n", 246 | "\n", 247 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(VPS_TO_TARGET_TABLE, filter, threshold=50)\n", 248 | "\n", 249 | "available_vps = list(vp_coordinates_per_ip.keys())\n", 250 | "accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps(available_vps, rtt_per_srcs_dst, vp_coordinates_per_ip,\n", 251 | " vp_distance_matrix, subset_sizes)\n", 252 | "\n", 253 | "dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## VPs selection algorithm\n", 261 | "\n", 262 | "Select respectively the 1, 3, and 10 closest probes (with minimal round trip time) for each target." 
263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 13, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "all_probes = load_json(PROBES_AND_ANCHORS_FILE)\n", 272 | "\n", 273 | "vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, vp_distance_matrix, probes_per_ip = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 16, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "ping_table_prefix = VPS_TO_PREFIX_TABLE\n", 283 | "ping_table = VPS_TO_TARGET_TABLE\n", 284 | "N_VPS_SELECTION_ALGORITHM = [1, 3, 10]\n", 285 | "results_files = [VP_SELECTION_ALGORITHM_PROBES_1_FILE, VP_SELECTION_ALGORITHM_PROBES_3_FILE, VP_SELECTION_ALGORITHM_PROBES_10_FILE]\n", 286 | "\n", 287 | "rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(ping_table_prefix, filter, threshold=100, is_per_prefix=True)\n", 288 | "rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)\n", 289 | "\n", 290 | "for i, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM):\n", 291 | " vps_per_target = compute_closest_rtt_probes(rtt_per_srcs_dst_prefix,\n", 292 | " vp_coordinates_per_ip,\n", 293 | " vp_distance_matrix,\n", 294 | " n_shortest=n_vp,\n", 295 | " is_prefix=True)\n", 296 | " features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip,\n", 297 | " [0],\n", 298 | " vps_per_target=vps_per_target,\n", 299 | " distance_operator=\">\", max_vps=100000,\n", 300 | " is_use_prefix=True,\n", 301 | " vp_distance_matrix=vp_distance_matrix,\n", 302 | " is_multiprocess=True)\n", 303 | " \n", 304 | " ofile = results_files[i]\n", 305 | " dump_json(features, ofile)" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "review-fXCvvitn-py3.10", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.10.9" 326 | }, 327 | "orig_nbformat": 4 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 2 331 | } 332 | -------------------------------------------------------------------------------- /analysis/million_scale.py: -------------------------------------------------------------------------------- 1 | from scripts.utils.file_utils import load_json, dump_json 2 | 3 | from scripts.analysis.analysis import * 4 | from default import * 5 | 6 | 7 | if __name__ == "__main__": 8 | # set to True to use your own datasets/measurements 9 | run_repro = True 10 | if run_repro: 11 | # DATASET FILES 12 | PROBES_FILE = REPRO_PROBES_FILE 13 | PROBES_AND_ANCHORS_FILE = REPRO_PROBES_AND_ANCHORS_FILE 14 | FILTERED_PROBES_FILE = REPRO_FILTERED_PROBES_FILE 15 | GREEDY_PROBES_FILE = REPRO_GREEDY_PROBES_FILE 16 | PAIRWISE_DISTANCE_FILE = REPRO_PAIRWISE_DISTANCE_FILE 17 | VPS_TO_TARGET_TABLE = PROBES_TO_ANCHORS_PING_TABLE 18 | VPS_TO_PREFIX_TABLE = PROBES_TO_PREFIX_TABLE 19 | 20 | # RESULT FILES 21 | PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE 22 | ROUND_BASED_ALGORITHM_FILE = REPRO_ROUND_BASED_ALGORITHM_FILE 23 | ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE 24 | VP_SELECTION_ALGORITHM_PROBES_1_FILE = ( 25 | REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE 26 | ) 27 | VP_SELECTION_ALGORITHM_PROBES_3_FILE = ( 28 | 
REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE 29 | ) 30 | VP_SELECTION_ALGORITHM_PROBES_10_FILE = ( 31 | REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE 32 | ) 33 | 34 | else: 35 | # DATASET FILES 36 | PROBES_FILE = USER_PROBES_FILE 37 | PROBES_AND_ANCHORS_FILE = USER_PROBES_AND_ANCHORS_FILE 38 | FILTERED_PROBES_FILE = USER_FILTERED_PROBES_FILE 39 | GREEDY_PROBES_FILE = USER_GREEDY_PROBES_FILE 40 | PAIRWISE_DISTANCE_FILE = USER_PAIRWISE_DISTANCE_FILE 41 | VPS_TO_TARGET_TABLE = USER_VPS_TO_TARGET_TABLE 42 | VPS_TO_PREFIX_TABLE = USER_VPS_TO_PREFIX_TABLE 43 | 44 | # RESULT FILES 45 | PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE 46 | ROUND_BASED_ALGORITHM_FILE = USER_ROUND_BASED_ALGORITHM_FILE 47 | ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE 48 | VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE 49 | VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE 50 | VP_SELECTION_ALGORITHM_PROBES_10_FILE = ( 51 | USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE 52 | ) 53 | 54 | LIMIT = 1000 55 | 56 | filtered_probes = load_json(FILTERED_PROBES_FILE) 57 | 58 | filter = "" 59 | if len(filtered_probes) > 0: 60 | # Remove probes that are wrongly geolocated 61 | in_clause = f"".join([f",toIPv4('{p}')" for p in filtered_probes])[1:] 62 | filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) " 63 | 64 | logger.info("Step 1: Compute errors") 65 | 66 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 67 | ( 68 | vp_coordinates_per_ip, 69 | ip_per_coordinates, 70 | country_per_vp, 71 | asn_per_vp, 72 | vp_distance_matrix, 73 | probes_per_ip, 74 | ) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE) 75 | 76 | rtt_per_srcs_dst = compute_rtts_per_dst_src( 77 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=70 78 | ) 79 | 80 | vps_per_target = { 81 | dst: set(vp_coordinates_per_ip.keys()) for dst in rtt_per_srcs_dst 82 | } 83 | features = compute_geolocation_features_per_ip( 84 | rtt_per_srcs_dst, 85 | vp_coordinates_per_ip, 86 | THRESHOLD_DISTANCES, 87 | vps_per_target=vps_per_target, 88 | distance_operator=">", 89 | max_vps=100000, 90 | is_use_prefix=False, 91 | vp_distance_matrix=vp_distance_matrix, 92 | ) 93 | 94 | dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE) 95 | 96 | logger.info("Step 2: Round Algorithm") 97 | 98 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 99 | 100 | asn_per_vp_ip = {} 101 | vp_coordinates_per_ip = {} 102 | 103 | for probe in all_probes: 104 | if ( 105 | "address_v4" in probe 106 | and "geometry" in probe 107 | and "coordinates" in probe["geometry"] 108 | ): 109 | ip_v4_address = probe["address_v4"] 110 | if ip_v4_address is None: 111 | continue 112 | long, lat = probe["geometry"]["coordinates"] 113 | asn_v4 = probe["asn_v4"] 114 | asn_per_vp_ip[ip_v4_address] = asn_v4 115 | vp_coordinates_per_ip[ip_v4_address] = lat, long 116 | 117 | # clickhouse is required here 118 | rtt_per_srcs_dst = compute_rtts_per_dst_src( 119 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=100 120 | ) 121 | vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE) 122 | 123 | TIER1_VPS = [10, 100, 300, 500, 1000] 124 | greedy_probes = load_json(GREEDY_PROBES_FILE) 125 | error_cdf_per_tier1_vps = {} 126 | for tier1_vps in TIER1_VPS: 127 | print(f"Using {tier1_vps} tier1_vps") 128 | error_cdf = round_based_algorithm( 129 | greedy_probes, 130 | rtt_per_srcs_dst, 131 | vp_coordinates_per_ip, 132 | asn_per_vp_ip, 133 | tier1_vps, 134 | threshold=40, 135 | ) 136 | error_cdf_per_tier1_vps[tier1_vps] = 
error_cdf 137 | 138 | dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE) 139 | 140 | logger.info("Accuracy vs number of vps probes") 141 | logger.warning("this step might takes several hours") 142 | 143 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 144 | 145 | ( 146 | vp_coordinates_per_ip, 147 | ip_per_coordinates, 148 | country_per_vp, 149 | asn_per_vp, 150 | vp_distance_matrix, 151 | probe_per_ip, 152 | ) = compute_geo_info(all_probes, serialized_file=PAIRWISE_DISTANCE_FILE) 153 | 154 | logger.info("Accuracy vs number of vps probes") 155 | 156 | subset_sizes = [] 157 | subset_sizes.extend([i for i in range(100, 1000, 100)]) 158 | # subset_sizes.extend([i for i in range(1000, 10001, 1000)]) 159 | 160 | rtt_per_srcs_dst = compute_rtts_per_dst_src( 161 | PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=50 162 | ) 163 | 164 | available_vps = list(vp_coordinates_per_ip.keys()) 165 | accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps( 166 | available_vps, 167 | rtt_per_srcs_dst, 168 | vp_coordinates_per_ip, 169 | vp_distance_matrix, 170 | subset_sizes, 171 | ) 172 | 173 | dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE) 174 | 175 | logger.info("vp selection algorithm") 176 | 177 | all_probes = load_json(PROBES_AND_ANCHORS_FILE) 178 | 179 | ( 180 | vp_coordinates_per_ip, 181 | ip_per_coordinates, 182 | country_per_vp, 183 | asn_per_vp, 184 | vp_distance_matrix, 185 | probes_per_ip, 186 | ) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE) 187 | 188 | ping_table_prefix = PROBES_TO_PREFIX_TABLE 189 | ping_table = PROBES_TO_ANCHORS_PING_TABLE 190 | N_VPS_SELECTION_ALGORITHM = [1, 3, 10] 191 | results_files = [ 192 | VP_SELECTION_ALGORITHM_PROBES_1_FILE, 193 | VP_SELECTION_ALGORITHM_PROBES_3_FILE, 194 | VP_SELECTION_ALGORITHM_PROBES_10_FILE, 195 | ] 196 | 197 | rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src( 198 | ping_table_prefix, filter, threshold=100, is_per_prefix=True 199 | ) 200 | rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70) 201 | 202 | for i, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM): 203 | vps_per_target = compute_closest_rtt_probes( 204 | rtt_per_srcs_dst_prefix, 205 | vp_coordinates_per_ip, 206 | vp_distance_matrix, 207 | n_shortest=n_vp, 208 | is_prefix=True, 209 | ) 210 | features = compute_geolocation_features_per_ip( 211 | rtt_per_srcs_dst, 212 | vp_coordinates_per_ip, 213 | [0], 214 | vps_per_target=vps_per_target, 215 | distance_operator=">", 216 | max_vps=100000, 217 | is_use_prefix=True, 218 | vp_distance_matrix=vp_distance_matrix, 219 | is_multiprocess=True, 220 | ) 221 | 222 | ofile = results_files[i] 223 | dump_json(features, ofile) 224 | -------------------------------------------------------------------------------- /analysis/ripe_atlas_probes_bias.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import pandas as pd\n", 11 | "\n", 12 | "from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, REPRO_PROBES_AND_ANCHORS_FILE, REPRO_ANCHORS_FILE, REPRO_PROBES_FILE" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# load datasets" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "with ASNS_TYPE_CAIDA.open(\"r\") as f:\n", 29 | " asns_categories_caida = json.load(f)\n", 30 | 
"\n", 31 | "with ASNS_TYPE_STANFORD.open(\"r\") as f:\n", 32 | " asns_categories_stanford = json.load(f)\n", 33 | " \n", 34 | "with REPRO_PROBES_AND_ANCHORS_FILE.open(\"r\") as f:\n", 35 | " probes_and_anchors = json.load(f)\n", 36 | "\n", 37 | "with REPRO_PROBES_FILE.open(\"r\") as f:\n", 38 | " probes = json.load(f)\n", 39 | "\n", 40 | "with REPRO_ANCHORS_FILE.open(\"r\") as f:\n", 41 | " anchors = json.load(f)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def get_anchor_as_category(asns_category: dict, ripe_vps_dataset: dict) -> dict:\n", 51 | " \"\"\"return one category per anchor\"\"\"\n", 52 | " ripe_categories = []\n", 53 | "\n", 54 | " for ripe_vp in ripe_vps_dataset:\n", 55 | " try:\n", 56 | " ripe_categories.append({\n", 57 | " \"id\": ripe_vp['id'],\n", 58 | " \"category\": asns_category[str(ripe_vp[\"asn_v4\"])]\n", 59 | " })\n", 60 | " except KeyError:\n", 61 | " ripe_categories.append({\n", 62 | " \"id\": ripe_vp['id'],\n", 63 | " \"category\": \"Unknown\"\n", 64 | " })\n", 65 | " continue\n", 66 | " return ripe_categories\n", 67 | "\n", 68 | "def get_categories_percentage(categories_df: pd.DataFrame) -> dict:\n", 69 | " \"\"\"get percentage per categories from a set of categories\"\"\"\n", 70 | " category_repartition = dict()\n", 71 | "\n", 72 | " category_set = categories_df[\"category\"].unique()\n", 73 | " for category in category_set:\n", 74 | " percentage = len(categories_df[categories_df[\"category\"] == category]) * 100 / len(categories_df[\"id\"])\n", 75 | " category_repartition[category] = percentage\n", 76 | "\n", 77 | " print(f\"{category} : {len(categories_df[categories_df['category'] == category])} ({round(percentage,1)}%)\")\n", 78 | "\n", 79 | " assert round(sum([v for v in category_repartition.values()])) == 100 \n", 80 | "\n", 81 | " return category_repartition" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Get targets type" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "category_caida_anchors = get_anchor_as_category(asns_categories_caida, anchors)\n", 98 | "category_caida_probes = get_anchor_as_category(asns_categories_caida, probes)\n", 99 | "category_caida_probes_and_anchors = get_anchor_as_category(asns_categories_caida, probes_and_anchors)\n", 100 | "\n", 101 | "category_stanford_anchors = get_anchor_as_category(asns_categories_stanford, anchors)\n", 102 | "category_stanford_probes = get_anchor_as_category(asns_categories_stanford, probes)\n", 103 | "category_stanford_probes_and_anchors = get_anchor_as_category(asns_categories_stanford, probes_and_anchors)\n", 104 | "\n", 105 | "caida_df_anchors = pd.DataFrame(category_caida_anchors, columns=[\"id\", \"category\"])\n", 106 | "caida_df_probes = pd.DataFrame(category_caida_probes, columns=[\"id\", \"category\"])\n", 107 | "caida_df_probes_and_anchors = pd.DataFrame(category_caida_probes_and_anchors, columns=[\"id\", \"category\"])\n", 108 | "\n", 109 | "stanford_df_anchors = pd.DataFrame(category_stanford_anchors, columns=[\"id\", \"category\"])\n", 110 | "stanford_df_probes = pd.DataFrame(category_stanford_probes, columns=[\"id\", \"category\"])\n", 111 | "stanford_df_probes_and_anchors = pd.DataFrame(category_stanford_probes_and_anchors, columns=[\"id\", \"category\"])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 
| "# Caida categories" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Anchors results: \n", 131 | "\n", 132 | "Content : 229 (31.7%)\n", 133 | "Access : 211 (29.2%)\n", 134 | "Transit/Access : 197 (27.2%)\n", 135 | "Enterprise : 55 (7.6%)\n", 136 | "tier-1 : 6 (0.8%)\n", 137 | "Unknown : 25 (3.5%)\n", 138 | "\n", 139 | "Probes results: \n", 140 | "\n", 141 | "Access : 9124 (75.2%)\n", 142 | "Transit/Access : 1005 (8.3%)\n", 143 | "Enterprise : 410 (3.4%)\n", 144 | "Unknown : 312 (2.6%)\n", 145 | "Content : 1112 (9.2%)\n", 146 | "tier-1 : 166 (1.4%)\n", 147 | "\n", 148 | "Probes and anchors results: \n", 149 | "\n", 150 | "Access : 9347 (72.4%)\n", 151 | "Transit/Access : 1221 (9.5%)\n", 152 | "Enterprise : 472 (3.7%)\n", 153 | "Unknown : 339 (2.6%)\n", 154 | "Content : 1361 (10.5%)\n", 155 | "tier-1 : 174 (1.3%)\n", 156 | "\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "print(\"Anchors results: \\n\")\n", 162 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_anchors)\n", 163 | "print()\n", 164 | "\n", 165 | "print(\"Probes results: \\n\")\n", 166 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_probes)\n", 167 | "print()\n", 168 | "\n", 169 | "print(\"Probes and anchors results: \\n\")\n", 170 | "ripe_vps_categories_caida = get_categories_percentage(caida_df_probes_and_anchors)\n", 171 | "print()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Stanford categories" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "Anchors results: \n", 191 | "\n", 192 | "Computer and Information Technology : 521 (72.1%)\n", 193 | "Education and Research : 38 (5.3%)\n", 194 | "Community Groups and Nonprofits : 33 (4.6%)\n", 195 | "Health Care Services : 2 (0.3%)\n", 196 | "Finance and Insurance : 6 (0.8%)\n", 197 | "Unknown : 53 (7.3%)\n", 198 | "Media, Publishing, and Broadcasting : 21 (2.9%)\n", 199 | "Service : 25 (3.5%)\n", 200 | "Construction and Real Estate : 5 (0.7%)\n", 201 | "Travel and Accommodation : 2 (0.3%)\n", 202 | "Government and Public Administration : 3 (0.4%)\n", 203 | "Retail Stores, Wholesale, and E-commerce Sites : 5 (0.7%)\n", 204 | "Utilities (Excluding Internet Service) : 1 (0.1%)\n", 205 | "Manufacturing : 2 (0.3%)\n", 206 | "Other : 4 (0.6%)\n", 207 | "Museums, Libraries, and Entertainment : 1 (0.1%)\n", 208 | "Freight, Shipment, and Postal Services : 1 (0.1%)\n", 209 | "\n", 210 | "Probes results: \n", 211 | "\n", 212 | "Computer and Information Technology : 10028 (82.7%)\n", 213 | "Community Groups and Nonprofits : 129 (1.1%)\n", 214 | "Unknown : 842 (6.9%)\n", 215 | "Education and Research : 352 (2.9%)\n", 216 | "Construction and Real Estate : 60 (0.5%)\n", 217 | "Manufacturing : 25 (0.2%)\n", 218 | "Service : 300 (2.5%)\n", 219 | "Media, Publishing, and Broadcasting : 183 (1.5%)\n", 220 | "Other : 14 (0.1%)\n", 221 | "Retail Stores, Wholesale, and E-commerce Sites : 105 (0.9%)\n", 222 | "Government and Public Administration : 18 (0.1%)\n", 223 | "Health Care Services : 8 (0.1%)\n", 224 | "Finance and Insurance : 22 (0.2%)\n", 225 | "Utilities (Excluding Internet Service) : 16 (0.1%)\n", 226 | "Museums, Libraries, and Entertainment : 8 (0.1%)\n", 227 | "Travel and 
Accommodation : 10 (0.1%)\n", 228 | "Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming) : 4 (0.0%)\n", 229 | "Freight, Shipment, and Postal Services : 5 (0.0%)\n", 230 | "\n", 231 | "Probes and anchors results: \n", 232 | "\n", 233 | "Computer and Information Technology : 10590 (82.0%)\n", 234 | "Community Groups and Nonprofits : 163 (1.3%)\n", 235 | "Unknown : 901 (7.0%)\n", 236 | "Education and Research : 393 (3.0%)\n", 237 | "Construction and Real Estate : 65 (0.5%)\n", 238 | "Manufacturing : 27 (0.2%)\n", 239 | "Service : 328 (2.5%)\n", 240 | "Media, Publishing, and Broadcasting : 206 (1.6%)\n", 241 | "Other : 19 (0.1%)\n", 242 | "Retail Stores, Wholesale, and E-commerce Sites : 115 (0.9%)\n", 243 | "Government and Public Administration : 21 (0.2%)\n", 244 | "Health Care Services : 10 (0.1%)\n", 245 | "Finance and Insurance : 28 (0.2%)\n", 246 | "Utilities (Excluding Internet Service) : 17 (0.1%)\n", 247 | "Museums, Libraries, and Entertainment : 9 (0.1%)\n", 248 | "Travel and Accommodation : 12 (0.1%)\n", 249 | "Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming) : 4 (0.0%)\n", 250 | "Freight, Shipment, and Postal Services : 6 (0.0%)\n", 251 | "\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "print(\"Anchors results: \\n\")\n", 257 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_anchors)\n", 258 | "print()\n", 259 | "\n", 260 | "print(\"Probes results: \\n\")\n", 261 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_probes)\n", 262 | "print()\n", 263 | "\n", 264 | "print(\"Probes and anchors results: \\n\")\n", 265 | "ripe_vps_categories_caida = get_categories_percentage(stanford_df_probes_and_anchors)\n", 266 | "print()" 267 | ] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "review-QY-dYH-y-py3.10", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.10.9" 287 | }, 288 | "orig_nbformat": 4 289 | }, 290 | "nbformat": 4, 291 | "nbformat_minor": 2 292 | } 293 | -------------------------------------------------------------------------------- /analysis/tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Print tables\n", 8 | "\n", 9 | "Print all the tables of the replication paper \n", 10 | "To do after analysis/million_scale.ipynb" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import pyasn\n", 20 | "\n", 21 | "from ipaddress import ip_network\n", 22 | "from clickhouse_driver import Client\n", 23 | "\n", 24 | "from scripts.utils.file_utils import load_json\n", 25 | "from scripts.utils.clickhouse import Clickhouse\n", 26 | "from scripts.analysis.analysis import get_all_bgp_prefixes, is_same_bgp_prefix, every_tier_result_and_errors\n", 27 | "from scripts.utils.helpers import haversine\n", 28 | "from default import IP_TO_ASN_FILE, ANALYZABLE_FILE, ROUND_BASED_ALGORITHM_FILE, TARGET_TO_LANDMARKS_PING_TABLE" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | 
"## Measurement overhead" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Figure 3.c of the replication paper" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "round_based_algorithm_results = load_json(ROUND_BASED_ALGORITHM_FILE)\n", 52 | "\n", 53 | "round_based_algorithm_results = {\n", 54 | "int(x): round_based_algorithm_results[x] for x in round_based_algorithm_results}" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "10 5785182\n", 67 | "100 4459050\n", 68 | "300 3205290\n", 69 | "500 2800245\n", 70 | "1000 2817933\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "for tier1_vps, results in sorted(round_based_algorithm_results.items()):\n", 76 | " tier1_vps = int(tier1_vps)\n", 77 | " n_vps_cdf = [r[2] + tier1_vps for r in results if r[2] is not None]\n", 78 | " print(tier1_vps, 3 * sum(n_vps_cdf))" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Number of landmarks within a certain radius" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Figure 5.b of the replication paper" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Found 78.128.211.119 with a landmark in the same /24\n", 105 | "Found 77.109.180.62 with a landmark in the same /24\n", 106 | "Found 103.143.136.43 with a landmark in the same /24\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "data = load_json(ANALYZABLE_FILE)\n", 112 | "\n", 113 | "valid_landmarks_count = 0\n", 114 | "unvalid_landmarks_count = 0\n", 115 | "same_asn_lst = []\n", 116 | "same_24_lst = []\n", 117 | "same_bgp_lst = []\n", 118 | "distances_to_landmarks = []\n", 119 | "all_landmarks = []\n", 120 | "asndb = pyasn.pyasn(str(IP_TO_ASN_FILE))\n", 121 | "bgp_prefixes = get_all_bgp_prefixes()\n", 122 | "\n", 123 | "for _, d in data.items():\n", 124 | " same_asn = 0\n", 125 | " diff_asn = 0\n", 126 | " same_bgp = 0\n", 127 | " diff_bgp = 0\n", 128 | " same_24 = 0\n", 129 | " diff_24 = 0\n", 130 | " all_landmarks.append(0)\n", 131 | " if \"tier2:cdn_count\" in d and \"tier2:landmark_count\" in d and \"tier2:failed_header_test_count\" in d:\n", 132 | " all_landmarks[-1] += d['tier2:landmark_count'] + \\\n", 133 | " d['tier2:cdn_count'] + d['tier2:failed_header_test_count']\n", 134 | " valid_landmarks_count += d['tier2:landmark_count']\n", 135 | " unvalid_landmarks_count += d['tier2:cdn_count'] + \\\n", 136 | " d['tier2:failed_header_test_count']\n", 137 | " if \"tier3:cdn_count\" in d and \"tier3:landmark_count\" in d and \"tier3:failed_header_test_count\" in d:\n", 138 | " all_landmarks[-1] += d['tier3:landmark_count'] + \\\n", 139 | " d['tier3:cdn_count'] + d['tier3:failed_header_test_count']\n", 140 | " valid_landmarks_count += d['tier3:landmark_count']\n", 141 | " unvalid_landmarks_count += d['tier3:cdn_count'] + \\\n", 142 | " d['tier3:failed_header_test_count']\n", 143 | " for f in ['tier2:traceroutes', 'tier3:traceroutes']:\n", 144 | " if f in d:\n", 145 | " for t in d[f]:\n", 146 | "\n", 147 | " ipt = t[1]\n", 148 | " ipl = t[2]\n", 149 | " asnt = asndb.lookup(ipt)[0]\n", 150 | " asnl = asndb.lookup(ipl)[0]\n", 151 | " if 
asnl != None and asnt != None:\n", 152 | " if asnt == asnl:\n", 153 | " same_asn += 1\n", 154 | " else:\n", 155 | " diff_asn += 1\n", 156 | "\n", 157 | " nt = ip_network(ipt+\"/24\", strict=False).network_address\n", 158 | " nl = ip_network(ipl+\"/24\", strict=False).network_address\n", 159 | " if nt == nl:\n", 160 | " same_24 += 1\n", 161 | " else:\n", 162 | " diff_24 += 1\n", 163 | "\n", 164 | " if is_same_bgp_prefix(ipt, ipl, bgp_prefixes):\n", 165 | " same_bgp += 1\n", 166 | " else:\n", 167 | " diff_bgp += 1\n", 168 | "\n", 169 | " distances = []\n", 170 | " for f in ['tier2:landmarks', 'tier3:landmarks']:\n", 171 | " target_geo = (d['RIPE:lat'], d['RIPE:lon'])\n", 172 | " if f in d:\n", 173 | " for l in d[f]:\n", 174 | " landmark_geo = (l[2], l[3])\n", 175 | " distances.append(haversine(target_geo, landmark_geo))\n", 176 | " distances_to_landmarks.append(distances)\n", 177 | "\n", 178 | " if same_asn != 0 or diff_asn != 0:\n", 179 | " same_asn_lst.append(same_asn/(same_asn+diff_asn))\n", 180 | "\n", 181 | " if same_24 != 0 or diff_24 != 0:\n", 182 | " same_24_lst.append(same_24/(same_24+diff_24))\n", 183 | " if same_24 != 0:\n", 184 | " print(\n", 185 | " f\"Found {d['target_ip']} with a landmark in the same /24\")\n", 186 | " if same_bgp != 0 or diff_bgp != 0:\n", 187 | " same_bgp_lst.append(same_bgp/(diff_bgp+same_bgp))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 4, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "713 target have potentail landmarks or 0.9861687413554634\n", 200 | "677 target have valid landmarks or 0.9363762102351314\n", 201 | "207 target with a landmark within 1 km or 0.2863070539419087\n", 202 | "419 target with a landmark within 5 km or 0.5795297372060858\n", 203 | "464 target with a landmark within 10 km or 0.6417704011065007\n", 204 | "552 target with a landmark within 40 km or 0.7634854771784232\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "landmarks_all = []\n", 210 | "landmarks_less_1 = []\n", 211 | "landmarks_less_5 = []\n", 212 | "landmarks_less_10 = []\n", 213 | "landmarks_less_40 = []\n", 214 | "\n", 215 | "for landmark_distances in distances_to_landmarks:\n", 216 | " landmarks_all.append(len(landmark_distances))\n", 217 | " landmarks_less_1.append(len([i for i in landmark_distances if i <= 1]))\n", 218 | " landmarks_less_5.append(len([i for i in landmark_distances if i <= 5]))\n", 219 | " landmarks_less_10.append(\n", 220 | " len([i for i in landmark_distances if i <= 10]))\n", 221 | " landmarks_less_40.append(\n", 222 | " len([i for i in landmark_distances if i <= 40]))\n", 223 | "\n", 224 | "lm_a_0 = len([i for i in all_landmarks if i > 0])\n", 225 | "lmv_a_0 = len([i for i in landmarks_all if i > 0])\n", 226 | "lm1_0 = len([i for i in landmarks_less_1 if i > 0])\n", 227 | "lm5_0 = len([i for i in landmarks_less_5 if i > 0])\n", 228 | "lm10_0 = len([i for i in landmarks_less_10 if i > 0])\n", 229 | "lm40_0 = len([i for i in landmarks_less_40 if i > 0])\n", 230 | "\n", 231 | "\n", 232 | "len_all = len(data)\n", 233 | "print(f\"{lm_a_0} target have potentail landmarks or {lm_a_0/len_all}\")\n", 234 | "print(f\"{lmv_a_0} target have valid landmarks or {lmv_a_0/len_all}\")\n", 235 | "print(f\"{lm1_0} target with a landmark within 1 km or {lm1_0/len_all}\")\n", 236 | "print(f\"{lm5_0} target with a landmark within 5 km or {lm5_0/len_all}\")\n", 237 | "print(f\"{lm10_0} target with a landmark within 10 km or {lm10_0/len_all}\")\n", 238 | 
"print(f\"{lm40_0} target with a landmark within 40 km or {lm40_0/len_all}\")" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 5, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stderr", 248 | "output_type": "stream", 249 | "text": [ 250 | "2023-09-14 13:19:51::INFO:root:analysis:: Tier1 Failed\n" 251 | ] 252 | }, 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "207 targets with landmarks (ping <= 1) or 0.2863070539419087\n", 258 | "419 targets with landmarks (ping <= 5) or 0.5795297372060858\n", 259 | "464 targets with landmarks (ping <= 10) or 0.6417704011065007\n", 260 | "552 targets with landmarks (ping <= 40) or 0.7634854771784232\n", 261 | "723 targets with landmarks (ping <= 9999999999) or 1.0\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "clickhouse_driver = Clickhouse()\n", 267 | "query = clickhouse_driver.get_min_rtt_per_src_dst_prefix_query(TARGET_TO_LANDMARKS_PING_TABLE, filter=\"\", threshold=1000000)\n", 268 | "db_table = clickhouse_driver.execute(query)\n", 269 | "\n", 270 | "rtts = []\n", 271 | "remove_dict = {}\n", 272 | "for l in db_table:\n", 273 | " rtts.append(l[2])\n", 274 | " remove_dict[(l[0], l[1])] = l[2]\n", 275 | "\n", 276 | "error1 = []\n", 277 | "error2 = []\n", 278 | "error3 = []\n", 279 | "error4 = []\n", 280 | "error1ms = []\n", 281 | "error2ms = []\n", 282 | "error5ms = []\n", 283 | "error10ms = []\n", 284 | "\n", 285 | "for _, d in data.items():\n", 286 | " errors = every_tier_result_and_errors(d)\n", 287 | " error1.append(errors['error1'])\n", 288 | " error2.append(errors['error2'])\n", 289 | " error3.append(errors['error3'])\n", 290 | " error4.append(errors['error4'])\n", 291 | " err1ms = 50000\n", 292 | " err2ms = 50000\n", 293 | " err5ms = 50000\n", 294 | " err10ms = 50000\n", 295 | " for f in ['tier2:landmarks', 'tier3:landmarks']:\n", 296 | " if f in d:\n", 297 | " for l_ip, _, l_lat, l_lon in d[f]:\n", 298 | " dist = haversine((l_lat, l_lon), (d['RIPE:lat'], d['RIPE:lon']))\n", 299 | " key_rtt = (l_ip, d['target_ip'])\n", 300 | " if dist < err1ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 1):\n", 301 | " err1ms = dist\n", 302 | " if dist < err2ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 2):\n", 303 | " err2ms = dist\n", 304 | " if dist < err5ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 5):\n", 305 | " err5ms = dist\n", 306 | " if dist < err10ms and (key_rtt not in remove_dict or remove_dict[key_rtt] <= 10):\n", 307 | " err10ms = dist\n", 308 | " if err1ms != 50000:\n", 309 | " error1ms.append(err1ms)\n", 310 | " else:\n", 311 | " error1ms.append(error1[-1])\n", 312 | " if err2ms != 50000:\n", 313 | " error2ms.append(err2ms)\n", 314 | " else:\n", 315 | " error2ms.append(error1[-1])\n", 316 | " if err5ms != 50000:\n", 317 | " error5ms.append(err5ms)\n", 318 | " else:\n", 319 | " error5ms.append(error1[-1])\n", 320 | " if err10ms != 50000:\n", 321 | " error10ms.append(err10ms)\n", 322 | " else:\n", 323 | " error10ms.append(error1[-1])\n", 324 | "\n", 325 | "for i in [1, 5, 10, 40, 9999999999]:\n", 326 | " c = len([j for j in error1ms if j <= i])\n", 327 | " print(f\"{c} targets with landmarks (ping <= {i}) or {c/len(error1ms)}\")" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "review-8XQ99qZ1-py3.10", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | 
"file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.9.13" 348 | }, 349 | "orig_nbformat": 4 350 | }, 351 | "nbformat": 4, 352 | "nbformat_minor": 2 353 | } 354 | -------------------------------------------------------------------------------- /clickhouse_files/init-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | clickhouse client -n <<-EOSQL 5 | CREATE DATABASE IF NOT EXISTS geolocation_replication; 6 | EOSQL 7 | -------------------------------------------------------------------------------- /clickhouse_files/users.d/default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /default.py: -------------------------------------------------------------------------------- 1 | """All the reference paths to storing files settings and constants""" 2 | 3 | from pathlib import Path 4 | 5 | # Default path 6 | DEFAULT_DIR: Path = Path(__file__).resolve().parent 7 | 8 | 9 | ################################################################################################## 10 | # CONSTANTS # 11 | ################################################################################################## 12 | THRESHOLD_DISTANCES = [0, 40, 100, 500, 1000] 13 | SPEED_OF_LIGHT = 300000 14 | SPEED_OF_INTERNET = SPEED_OF_LIGHT * 2 / 3 15 | 16 | 17 | # Atlas path 18 | ATLAS_PATH: Path = DEFAULT_DIR / "datasets/atlas/" 19 | ################################################################################################## 20 | # REPRODUCIBILITY DATASET FILES (static) # 21 | ################################################################################################## 22 | REPRO_PATH: Path = DEFAULT_DIR / "datasets/reproducibility_datasets/" 23 | REPRO_ATLAS_PATH: Path = REPRO_PATH / "atlas/" 24 | REPRO_GENERATED_PATH: Path = REPRO_PATH / "generated/" 25 | 26 | REPRO_ANCHORS_FILE: Path = REPRO_ATLAS_PATH / "reproducibility_anchors.json" 27 | REPRO_PROBES_FILE: Path = REPRO_ATLAS_PATH / "reproducibility_probes.json" 28 | REPRO_PROBES_AND_ANCHORS_FILE: Path = ( 29 | REPRO_ATLAS_PATH / "reproducibility_probes_and_anchors.json" 30 | ) 31 | 32 | REPRO_PAIRWISE_DISTANCE_FILE: Path = ( 33 | REPRO_GENERATED_PATH / "reproducibility_pairwise_distance_ripe_probes.json" 34 | ) 35 | REPRO_REMOVED_PROBES_FILE: Path = ( 36 | REPRO_GENERATED_PATH / "reproducibility_removed_probes.json" 37 | ) 38 | REPRO_FILTERED_PROBES_FILE: Path = ( 39 | REPRO_GENERATED_PATH / "reproducibility_filtered_probes.json" 40 | ) 41 | REPRO_GREEDY_PROBES_FILE: Path = ( 42 | REPRO_GENERATED_PATH / "reproducibility_greedy_probes.json" 43 | ) 44 | REPRO_HITLIST_FILE: Path = REPRO_GENERATED_PATH / "reproducibility_parsed_hitlist.json" 45 | 46 | 47 | ################################################################################################## 48 | # USER DATASET FILES (generated) # 49 | ################################################################################################## 50 | USER_PATH: Path = DEFAULT_DIR / "datasets/user_datasets/" 51 | USER_ATLAS_PATH: Path = USER_PATH / "atlas/" 52 | USER_GENERATED_PATH: Path = USER_PATH / "generated/" 53 | 54 | USER_ANCHORS_FILE: Path = USER_ATLAS_PATH / "user_anchors.json" 55 | USER_PROBES_FILE: Path = USER_ATLAS_PATH / "user_probes.json" 56 | 
USER_PROBES_AND_ANCHORS_FILE: Path = USER_ATLAS_PATH / "user_probes_and_anchors.json" 57 | 58 | USER_PAIRWISE_DISTANCE_FILE: Path = ( 59 | USER_GENERATED_PATH / "user_pairwise_distance_ripe_probes.json" 60 | ) 61 | USER_REMOVED_PROBES_FILE: Path = USER_GENERATED_PATH / "user_removed_probes.json" 62 | USER_FILTERED_PROBES_FILE: Path = USER_GENERATED_PATH / "user_filtered_probes.json" 63 | USER_GREEDY_PROBES_FILE: Path = USER_GENERATED_PATH / "user_greedy_probes.json" 64 | USER_HITLIST_FILE: Path = USER_GENERATED_PATH / "user_parsed_hitlist.json" 65 | 66 | ################################################################################################## 67 | # CLICKHOUSE SETTINGS # 68 | ################################################################################################## 69 | CLICKHOUSE_CLIENT = DEFAULT_DIR / "clickhouse_files/clickhouse" 70 | CLICKHOUSE_HOST = "localhost" 71 | CLICKHOUSE_DB = "geolocation_replication" 72 | CLICKHOUSE_USER = "default" 73 | CLICKHOUSE_PASSWORD = "" 74 | 75 | # tables to store reproduction results 76 | ANCHORS_MESHED_PING_TABLE = "anchors_meshed_pings" 77 | ANCHORS_TO_PREFIX_TABLE = "anchors_to_prefix_pings" 78 | PROBES_TO_PREFIX_TABLE = "probes_to_prefix_pings" 79 | TARGET_TO_LANDMARKS_PING_TABLE = "targets_to_landmarks_pings" 80 | PROBES_TO_ANCHORS_PING_TABLE = "ping_10k_to_anchors" 81 | ANCHORS_MESHED_TRACEROUTE_TABLE = "anchors_meshed_traceroutes" 82 | STREET_LEVEL_TRACEROUTES_TABLE = "street_lvl_traceroutes" 83 | 84 | # tables to store user measurements 85 | USER_VPS_TO_PREFIX_TABLE = "user_vps_to_prefix" 86 | USER_VPS_TO_TARGET_TABLE = "user_vps_to_target" 87 | 88 | USER_TARGET_TO_LANDMARKS_PING_TABLE = "user_targets_to_landmarks_pings" 89 | USER_ANCHORS_MESHED_TRACEROUTE_TABLE = "user_anchors_meshed_traceroutes" 90 | USER_STREET_LEVEL_TRACEROUTES_TABLE = "user_street_lvl_traceroutes" 91 | 92 | # reproduction results files 93 | CLICKHOUSE_STATIC_DATASET: Path = DEFAULT_DIR / "datasets/clickhouse_data" 94 | 95 | ANCHORS_MESHED_PING_FILE = ( 96 | CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_MESHED_PING_TABLE}.zst" 97 | ) 98 | ANCHORS_TO_PREFIX_FILE = CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_TO_PREFIX_TABLE}.zst" 99 | PROBES_TO_PREFIX_FILE = CLICKHOUSE_STATIC_DATASET / f"{PROBES_TO_PREFIX_TABLE}.zst" 100 | TARGET_TO_LANDMARKS_PING_FILE = ( 101 | CLICKHOUSE_STATIC_DATASET / f"{TARGET_TO_LANDMARKS_PING_TABLE}.zst" 102 | ) 103 | PROBES_TO_ANCHORS_PING_FILE = ( 104 | CLICKHOUSE_STATIC_DATASET / f"{PROBES_TO_ANCHORS_PING_TABLE}.zst" 105 | ) 106 | ANCHORS_MESHED_TRACEROUTE_FILE = ( 107 | CLICKHOUSE_STATIC_DATASET / f"{ANCHORS_MESHED_TRACEROUTE_TABLE}.zst" 108 | ) 109 | STREET_LEVEL_TRACEROUTES_FILE = ( 110 | CLICKHOUSE_STATIC_DATASET / f"{STREET_LEVEL_TRACEROUTES_TABLE}.zst" 111 | ) 112 | 113 | 114 | ################################################################################################## 115 | # RIPE ATLAS VPS BIAS ANALYSIS # 116 | ################################################################################################## 117 | ASNS_TYPES: Path = DEFAULT_DIR / "datasets/asns_types" 118 | ASNS_TYPE_CAIDA: Path = ASNS_TYPES / "caida_enhanced_as_type.json" 119 | ASNS_TYPE_STANFORD: Path = ASNS_TYPES / "AS_categories_stanford.json" 120 | 121 | 122 | ################################################################################################## 123 | # STATIC FILES # 124 | ################################################################################################## 125 | STATIC_PATH: Path = DEFAULT_DIR / "datasets/static_datasets/" 126 | 
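# ------------------------------------------------------------------------------------------------
# Illustrative sketch (assumption only, not referenced by the repository's scripts): how the
# constants above fit together. SPEED_OF_INTERNET (km/s, ~2/3 of the speed of light, defined at
# the top of this module) gives the usual CBG-style conversion from a minimum RTT to an upper
# bound on the vantage-point-to-target distance, and the CLICKHOUSE_* settings plus the ping
# table names are what the analysis code uses to read stored measurements. The helper name
# rtt_to_max_distance_km and the example query below are hypothetical.
def rtt_to_max_distance_km(min_rtt_ms: float) -> float:
    """Upper bound on distance: one-way delay (in seconds) times propagation speed (km/s)."""
    return (min_rtt_ms / 2) / 1000 * SPEED_OF_INTERNET


# Hypothetical usage against the local ClickHouse instance started by install.sh:
#   from clickhouse_driver import Client
#   client = Client(host=CLICKHOUSE_HOST, user=CLICKHOUSE_USER,
#                   password=CLICKHOUSE_PASSWORD, database=CLICKHOUSE_DB)
#   (count,), = client.execute(f"SELECT count() FROM {PROBES_TO_ANCHORS_PING_TABLE}")
#   rtt_to_max_distance_km(10)  # a 10 ms RTT caps the distance at ~1000 km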
127 | COUNTRIES_JSON_FILE: Path = STATIC_PATH / "countries.json" 128 | COUNTRIES_TXT_FILE: Path = STATIC_PATH / "countries.txt" 129 | COUNTRIES_CSV_FILE: Path = STATIC_PATH / "iso_code_2.csv" 130 | POPULATION_CITY_FILE: Path = STATIC_PATH / "population.json" 131 | CITIES_500_FILE: Path = STATIC_PATH / "cities500.txt" 132 | POPULATION_DENSITY_FILE: Path = ( 133 | STATIC_PATH / "gpw_v4_population_density_rev11_2020_30_sec.tif" 134 | ) 135 | 136 | ADDRESS_FILE: Path = ( 137 | STATIC_PATH / "internet_address_verfploeter_hitlist_it102w-20230125.fsdb" 138 | ) 139 | GEOLITE_FILE: Path = STATIC_PATH / "GeoLite2-City-Blocks-IPv4_20230516.tree" 140 | IP_INFO_GEO_FILE: Path = STATIC_PATH / "ip_info_geo_anchors.json" 141 | MAXMIND_GEO_FILE: Path = STATIC_PATH / "maxmind_free_geo_anchors.json" 142 | 143 | GEOPAPIFY_1_FILE: Path = STATIC_PATH / "geocoded_by_geoapify-10_05_2023_0_500.csv" 144 | GEOPAPIFY_2_FILE: Path = STATIC_PATH / "geocoded_by_geoapify-10_05_2023_500_last.csv" 145 | 146 | IP_TO_ASN_FILE: Path = STATIC_PATH / "2022-03-28.dat" 147 | ANCHORS_SECOND_PAPER_FILE: Path = STATIC_PATH / "anchors_ip_list.json" 148 | CACHED_WEBSITES_FILE: Path = STATIC_PATH / "websites.json" 149 | BGP_PRIFIXES_FILE: Path = STATIC_PATH / "bgp_prefixes.json" 150 | 151 | ################################################################################################## 152 | # ANALYSIS RESULTS FILES # 153 | ################################################################################################## 154 | 155 | # REPRODUCIBILITY 156 | REPRO_ANALYSIS_PATH: Path = DEFAULT_DIR / "analysis/results/reproducibility/" 157 | 158 | REPRO_PROBES_TO_ANCHORS_RESULT_FILE: Path = ( 159 | REPRO_ANALYSIS_PATH / "cbg_thresholds_probes_to_anchors.json" 160 | ) 161 | REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE: Path = ( 162 | REPRO_ANALYSIS_PATH / "vp_selection_algorithm_probes_1.json" 163 | ) 164 | REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE: Path = ( 165 | REPRO_ANALYSIS_PATH / "vp_selection_algorithm_probes_3.json" 166 | ) 167 | REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE: Path = ( 168 | REPRO_ANALYSIS_PATH / "vp_selection_algoxrithm_probes_10.json" 169 | ) 170 | REPRO_ACCURACY_VS_N_VPS_PROBES_FILE: Path = ( 171 | REPRO_ANALYSIS_PATH / "accuracy_vs_n_vps_probes.json" 172 | ) 173 | REPRO_ROUND_BASED_ALGORITHM_FILE: Path = ( 174 | REPRO_ANALYSIS_PATH / "round_based_algorithm_error_cdf.json" 175 | ) 176 | 177 | # FROM USER MEASUREMENTS 178 | USER_ANALYSIS_PATH: Path = DEFAULT_DIR / "analysis/results/user/" 179 | 180 | USER_PROBES_TO_ANCHORS_RESULT_FILE: Path = ( 181 | USER_ANALYSIS_PATH / "cbg_thresholds_probes_to_anchors.json" 182 | ) 183 | USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE: Path = ( 184 | USER_ANALYSIS_PATH / "vp_selection_algorithm_probes_1.json" 185 | ) 186 | USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE: Path = ( 187 | USER_ANALYSIS_PATH / "vp_selection_algorithm_probes_3.json" 188 | ) 189 | USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE: Path = ( 190 | USER_ANALYSIS_PATH / "vp_selection_algoxrithm_probes_10.json" 191 | ) 192 | USER_ACCURACY_VS_N_VPS_PROBES_FILE: Path = ( 193 | USER_ANALYSIS_PATH / "accuracy_vs_n_vps_probes.json" 194 | ) 195 | USER_ROUND_BASED_ALGORITHM_FILE: Path = ( 196 | USER_ANALYSIS_PATH / "round_based_algorithm_error_cdf.json" 197 | ) 198 | 199 | ################################################################################################## 200 | # MEASUREMENTS RESULTS FILES # 201 | ################################################################################################## 202 | 
MEASUREMENTS_MILLION_SCALE_PATH: Path = ( 203 | DEFAULT_DIR / "measurements/results/million_scale/" 204 | ) 205 | MEASUREMENTS_STREET_LEVEL_PATH: Path = ( 206 | DEFAULT_DIR / "measurements/results/street_level/" 207 | ) 208 | MEASUREMENT_CONFIG_PATH: Path = ( 209 | DEFAULT_DIR / "measurements/results/million_scale/measurement_config/" 210 | ) 211 | 212 | ############## MILLION SCALE FILES 213 | PREFIX_MEASUREMENT_RESULTS: Path = ( 214 | MEASUREMENTS_MILLION_SCALE_PATH / "prefix_measurement_results.json" 215 | ) 216 | TARGET_MEASUREMENT_RESULTS: Path = ( 217 | MEASUREMENTS_MILLION_SCALE_PATH / "target_measurement_results.json" 218 | ) 219 | 220 | ############## STREET LEVEL FILES 221 | ANALYZABLE_FILE: Path = MEASUREMENTS_STREET_LEVEL_PATH / "all_res.json" 222 | 223 | 224 | ################################################################################################## 225 | # FIGURES FILES # 226 | ################################################################################################## 227 | 228 | # REPRODUCIBILITY 229 | REPRO_FIGURE_PATH: Path = DEFAULT_DIR / "analysis/figures/reproducibility" 230 | 231 | REPRO_GEO_DATABASE_FILE: Path = REPRO_FIGURE_PATH / "geo_databases.pdf" 232 | REPRO_ACCURACY_VS_NB_VPS_FILE: Path = REPRO_FIGURE_PATH / "accuracy_vs_n_vps_probes.pdf" 233 | REPRO_ACCURACY_VS_SUBSET_SIZES_FILE: Path = ( 234 | REPRO_FIGURE_PATH / "accuracy_vs_subset_sizes.pdf" 235 | ) 236 | REPRO_CBG_THRESHOLD_PROBES_FILE: Path = REPRO_FIGURE_PATH / "cbg_thresholds_probes.pdf" 237 | REPRO_CBG_THRESHOLD_VP_SELECTION_FILE: Path = ( 238 | REPRO_FIGURE_PATH / "cbg_thresholds_vp_selection.pdf" 239 | ) 240 | REPRO_CBG_THRESHOLD_CONTINENT_FILE: Path = ( 241 | REPRO_FIGURE_PATH / "cbg_thresholds_continent.pdf" 242 | ) 243 | REPRO_ROUND_ALGORITHM_ERROR_FILE: Path = REPRO_FIGURE_PATH / "round_algorithm_error.pdf" 244 | REPRO_CLOSE_LANDMARK_FILE: Path = REPRO_FIGURE_PATH / "cdf_close_landmark_check_log.pdf" 245 | REPRO_INVALID_RTT_FILE: Path = REPRO_FIGURE_PATH / "invalid_rtt.pdf" 246 | REPRO_TIME_TO_GEOLOCATE_FILE: Path = REPRO_FIGURE_PATH / "cdf_time_to_geolocate.pdf" 247 | REPRO_SCATTER_DISTANCE_FILE: Path = REPRO_FIGURE_PATH / "scatter_md_vs_d.pdf" 248 | REPRO_SCATTER_DENSITY_FILE: Path = REPRO_FIGURE_PATH / "scatter_density.pdf" 249 | REPRO_CDF_DENSITY_FILE: Path = REPRO_FIGURE_PATH / "cdf_density.pdf" 250 | 251 | # FROM USER MEASUREMENTS 252 | USER_FIGURE_PATH: Path = DEFAULT_DIR / "analysis/figures/user" 253 | 254 | REPRO_GEO_DATABASE_FILE: Path = USER_FIGURE_PATH / "geo_databases.pdf" 255 | USER_ACCURACY_VS_NB_VPS_FILE: Path = USER_FIGURE_PATH / "accuracy_vs_n_vps_probes.pdf" 256 | USER_ACCURACY_VS_SUBSET_SIZES_FILE: Path = ( 257 | USER_FIGURE_PATH / "accuracy_vs_subset_sizes.pdf" 258 | ) 259 | USER_CBG_THRESHOLD_PROBES_FILE: Path = USER_FIGURE_PATH / "cbg_thresholds_probes.pdf" 260 | USER_CBG_THRESHOLD_VP_SELECTION_FILE: Path = ( 261 | USER_FIGURE_PATH / "cbg_thresholds_vp_selection.pdf" 262 | ) 263 | USER_CBG_THRESHOLD_CONTINENT_FILE: Path = ( 264 | USER_FIGURE_PATH / "cbg_thresholds_continent.pdf" 265 | ) 266 | USER_ROUND_ALGORITHM_ERROR_FILE: Path = USER_FIGURE_PATH / "round_algorithm_error.pdf" 267 | USER_CLOSE_LANDMARK_FILE: Path = USER_FIGURE_PATH / "cdf_close_landmark_check_log.pdf" 268 | USER_INVALID_RTT_FILE: Path = USER_FIGURE_PATH / "invalid_rtt.pdf" 269 | USER_TIME_TO_GEOLOCATE_FILE: Path = USER_FIGURE_PATH / "cdf_time_to_geolocate.pdf" 270 | USER_SCATTER_DISTANCE_FILE: Path = USER_FIGURE_PATH / "scatter_md_vs_d.pdf" 271 | USER_SCATTER_DENSITY_FILE: Path = 
USER_FIGURE_PATH / "scatter_density.pdf" 272 | USER_CDF_DENSITY_FILE: Path = USER_FIGURE_PATH / "cdf_density.pdf" 273 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | # pull the docker image 2 | docker pull clickhouse/clickhouse-server:22.6 3 | 4 | 5 | # start the server using docker 6 | docker run --rm -d \ 7 | -v ./clickhouse_files/data:/var/lib/clickhouse/ \ 8 | -v ./clickhouse_files/logs:/var/log/clickhouse-server/ \ 9 | -v ./clickhouse_files/users.d:/etc/clickhouse-server/users.d:ro \ 10 | -v ./clickhouse_files/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh \ 11 | -p 8123:8123 \ 12 | -p 9000:9000 \ 13 | --ulimit nofile=262144:262144 \ 14 | clickhouse/clickhouse-server:22.6 15 | 16 | # download clickhouse client binary 17 | curl https://clickhouse.com/ | sh 18 | mv clickhouse ./clickhouse_files/ 19 | 20 | # install source files 21 | poetry lock 22 | poetry install 23 | 24 | # run clickhouse db installer for table init 25 | poetry run python scripts/utils/clickhouse_installer.py 26 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s::%(levelname)s:%(name)s:%(module)s:: %(message)s", 5 | level=logging.INFO, 6 | datefmt="%Y-%m-%d %H:%M:%S", 7 | ) 8 | 9 | logger = logging.getLogger() 10 | -------------------------------------------------------------------------------- /measurements/landmark_traceroutes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Probing part 4\n", 8 | "\n", 9 | "Vantage points will probe the targets in a 3-step method, either by doing pings or traceroutes. \n", 10 | "\n", 11 | "Vantage points are the Ripe Atlas anchors, then indireclty some online landmarks. \n", 12 | "As always, targets are the anchors. \n", 13 | "\n", 14 | "This notebook is an implementation of the street level method. Check the paper for more information. 
\n", 15 | "To do after create_datasets.ipynb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import traceback\n", 25 | "\n", 26 | "from pprint import pprint\n", 27 | "from clickhouse_driver import Client\n", 28 | "\n", 29 | "from scripts.utils.file_utils import load_json, dump_json\n", 30 | "from scripts.utils.measurement_utils import load_vps\n", 31 | "from scripts.utils.helpers import haversine\n", 32 | "from scripts.street_level.traceroutes_results import serialize\n", 33 | "from scripts.street_level.three_tiers import get_all_info_geoloc\n", 34 | "from default import USER_ANCHORS_FILE, ANALYZABLE_FILE\n", 35 | "\n", 36 | "NB_VP = 10" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### database for traceroutes" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Main\n", 51 | "\n", 52 | "This would take a lot of time (more than a day if you use all the VPs)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Anchors are the targets and Vantage points\n", 62 | "anchors = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VP)\n", 63 | "try:\n", 64 | " all_res = load_json(ANALYZABLE_FILE)\n", 65 | "except FileNotFoundError:\n", 66 | " all_res = {}\n", 67 | "\n", 68 | "i = 0\n", 69 | "for target in anchors.values():\n", 70 | " try:\n", 71 | " target_ip = target['address_v4']\n", 72 | " if target_ip in all_res: # we skip targets already geolocated\n", 73 | " continue\n", 74 | " print(f\"{i}:{target_ip}\")\n", 75 | " i += 1\n", 76 | "\n", 77 | " res = get_all_info_geoloc(target_ip, vps=anchors.values())\n", 78 | " res = serialize(res)\n", 79 | " # We save the coordinates of the targets as given by RIPE Atlas\n", 80 | " res['RIPE:lat'] = target['geometry']['coordinates'][1]\n", 81 | " res['RIPE:lon'] = target['geometry']['coordinates'][0]\n", 82 | "\n", 83 | " # We save the error of the estimated geolocation at each step\n", 84 | " if res['lat'] != None and res['lon'] != None:\n", 85 | " res['error'] = haversine(\n", 86 | " (res['lat'], res['lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 87 | " if 'tier1:lat' in res and 'tier1:lon' in res and res['tier1:lat'] != None and res['tier1:lon'] != None:\n", 88 | " res['tier1:error'] = haversine(\n", 89 | " (res['tier1:lat'], res['tier1:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 90 | " if 'tier2:lat' in res and 'tier2:lon' in res and res['tier2:lat'] != None and res['tier2:lon'] != None:\n", 91 | " res['tier2:error'] = haversine(\n", 92 | " (res['tier2:lat'], res['tier2:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 93 | " if 'tier3:lat' in res and 'tier3:lon' in res and res['tier3:lat'] != None and res['tier3:lon'] != None:\n", 94 | " res['tier3:error'] = haversine(\n", 95 | " (res['tier3:lat'], res['tier3:lon']), (res['RIPE:lat'], res['RIPE:lon']))\n", 96 | "\n", 97 | " all_res[target_ip] = res\n", 98 | " # We save the results\n", 99 | " dump_json(all_res, ANALYZABLE_FILE)\n", 100 | " except Exception:\n", 101 | " traceback.print_exc()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Geolocat one IP" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "{'target_ip': 
'195.83.132.129', 'tier1:done': False, 'tier2:done': False, 'tier3:done': False, 'negative_rtt_included': True, 'speed_threshold': 0.6666666666666666, 'tier1:lat': None, 'tier1:lon': None, 'vps': set(), 'tier1:duration': 1282.0457310676575, 'lat': None, 'lon': None}\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "target_ip = '195.83.132.129' # LAAS/CNRS\n", 126 | "geolocation = get_all_info_geoloc(target_ip)\n", 127 | "#geolocation = geoloc(target_ip)\n", 128 | "print(geolocation)\n", 129 | "geolocation = serialize(geolocation)\n", 130 | "dump_json(geolocation, 'res_tmp.json')" 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "review-8XQ99qZ1-py3.10", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.9.13" 151 | }, 152 | "orig_nbformat": 4 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 2 156 | } 157 | -------------------------------------------------------------------------------- /measurements/million_scale_measurements.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Probing part 1\n", 8 | "\n", 9 | "Vantage points will probe either the targets themselves (step 2) or other addresses in the same /24 prefix (step 1).\n", 10 | "\n", 11 | "Vantage points are only the anchors. \n", 12 | "As always, targets are the anchors. \n", 13 | "\n", 14 | "This notebook is an implementation of the million scale method. Check the paper for more information. 
\n", 15 | "To do after create_datasets.ipynb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import uuid\n", 25 | "\n", 26 | "from logger import logger\n", 27 | "from scripts.utils.file_utils import load_json\n", 28 | "from scripts.utils.measurement_utils import (\n", 29 | " load_targets,\n", 30 | " load_vps,\n", 31 | " get_measurement_config,\n", 32 | " save_measurement_config,\n", 33 | " get_target_prefixes,\n", 34 | " ping_prefixes,\n", 35 | " ping_targets,\n", 36 | ")\n", 37 | "from default import (\n", 38 | " USER_ANCHORS_FILE,\n", 39 | " USER_HITLIST_FILE,\n", 40 | " MEASUREMENT_CONFIG_PATH,\n", 41 | ")\n", 42 | "\n", 43 | "# will define the number of vps and targets to use\n", 44 | "NB_TARGETS = 2\n", 45 | "NB_VPS = 4" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Load targets and vps dataset" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "targets = load_targets(USER_ANCHORS_FILE, nb_target=NB_TARGETS)\n", 62 | "vps = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VPS)\n", 63 | "\n", 64 | "# every anchors /24 subnet\n", 65 | "target_addrs = [t[\"address_v4\"] for t in targets]\n", 66 | "target_prefixes = get_target_prefixes(target_addrs)\n", 67 | "\n", 68 | "# responsive IP addresses in all /24 prefixes\n", 69 | "targets_per_prefix = load_json(USER_HITLIST_FILE)\n", 70 | "\n", 71 | "logger.info(f\"nb targets: {len(targets)}\")\n", 72 | "logger.info(f\"nb_vps : {len(vps)}\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Generate measurement config\n", 80 | "\n", 81 | "This configuration is used to retrieve all measurements results from RIPE Atlas using their API." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "2024-10-04 13:13:06::INFO:root:3171606573:: Starting experiment with uuid : c78efe35-8089-41a9-9206-ac7bac4a8a68\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# measurement configuration for retrieval\n", 99 | "experiment_uuid = str(uuid.uuid4())\n", 100 | "target_measurement_uuid = str(uuid.uuid4())\n", 101 | "prefix_measurement_uuid = str(uuid.uuid4())\n", 102 | "\n", 103 | "config_file_path = MEASUREMENT_CONFIG_PATH / f\"{experiment_uuid}.json\"\n", 104 | "\n", 105 | "logger.info(f\"Starting experiment with uuid : {experiment_uuid}\")\n", 106 | "\n", 107 | "measurement_config = get_measurement_config(\n", 108 | " experiment_uuid=experiment_uuid,\n", 109 | " target_measurement_uuid=target_measurement_uuid,\n", 110 | " prefix_measurement_uuid=prefix_measurement_uuid,\n", 111 | " targets=targets,\n", 112 | " target_prefixes=target_prefixes,\n", 113 | " vps=vps,\n", 114 | ")\n", 115 | "\n", 116 | "save_measurement_config(measurement_config, config_file_path)" 117 | ] 118 | }, 119 | { 120 | "attachments": {}, 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "# Step 1: probing each target prefixes" 125 | ] 126 | }, 127 | { 128 | "attachments": {}, 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Probe target prefixes\n", 133 | "WARNING : Time consumming section" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stderr", 143 | "output_type": "stream", 144 | "text": [ 145 | "2024-10-04 13:13:06::INFO:root:measurement_utils:: No cached results available\n", 146 | "2024-10-04 13:13:06::INFO:root:measurement_utils:: Starting measurements dd2e9428-762d-4353-99ca-613057d430a3 with parameters: dry_run=False; nb_targets=2; nb_vps=4.\n", 147 | "2024-10-04 13:13:06::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942232\n", 148 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942233\n", 149 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942234\n", 150 | "2024-10-04 13:13:08::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942235\n", 151 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942236\n", 152 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement tag: dd2e9428-762d-4353-99ca-613057d430a3 : started measurement id : 79942237\n", 153 | "2024-10-04 13:13:09::INFO:root:ping_and_traceroute_classes:: measurement : dd2e9428-762d-4353-99ca-613057d430a3 done\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "ping_prefixes(\n", 159 | " measurement_uuid=prefix_measurement_uuid,\n", 160 | " measurement_config=measurement_config,\n", 161 | " target_prefixes=target_prefixes,\n", 162 | " targets_per_prefix=targets_per_prefix,\n", 163 | " vps=vps,\n", 164 | ")\n", 165 | "\n", 166 | "save_measurement_config(measurement_config, config_file_path)" 167 | ] 168 | }, 169 | { 170 | "attachments": {}, 171 | 
"cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Step 2: probing each target" 175 | ] 176 | }, 177 | { 178 | "attachments": {}, 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Probe targets\n", 183 | "WARNING : Time consumming section" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stderr", 193 | "output_type": "stream", 194 | "text": [ 195 | "2024-10-04 13:13:11::INFO:root:measurement_utils:: Starting measurements 6796bfe3-7137-43f1-9f9f-71e0a141157d with parameters: dry_run=False; nb_targets=6; nb_vps=4.\n", 196 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942244\n", 197 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942245\n", 198 | "2024-10-04 13:13:12::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942246\n", 199 | "2024-10-04 13:13:13::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942247\n", 200 | "2024-10-04 13:13:13::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942248\n", 201 | "2024-10-04 13:13:14::INFO:root:ping_and_traceroute_classes:: measurement tag: 6796bfe3-7137-43f1-9f9f-71e0a141157d : started measurement id : 79942249\n", 202 | "2024-10-04 13:13:14::INFO:root:ping_and_traceroute_classes:: measurement : 6796bfe3-7137-43f1-9f9f-71e0a141157d done\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "# measurement configuration for retrieval\n", 208 | "ping_targets(\n", 209 | " measurement_uuid=target_measurement_uuid,\n", 210 | " measurement_config=measurement_config,\n", 211 | " targets=targets,\n", 212 | " vps=vps,\n", 213 | " use_cache=False,\n", 214 | ")\n", 215 | "save_measurement_config(measurement_config, config_file_path)" 216 | ] 217 | }, 218 | { 219 | "attachments": {}, 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Retrieve prefix results\n", 224 | "WARNING : Time consuming section\n", 225 | "\n", 226 | "Note: it might take some time before measurement results are available through RIPE API. If no results are available, retry after a few minutes (or hours, it might really depends on the probe itself)." 
227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 7, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from logger import logger\n", 236 | "from scripts.utils.file_utils import load_json\n", 237 | "from scripts.utils.measurement_utils import (\n", 238 | " retrieve_results,\n", 239 | " insert_prefix_results,\n", 240 | " insert_target_results,\n", 241 | ")\n", 242 | "from default import (\n", 243 | " PREFIX_MEASUREMENT_RESULTS,\n", 244 | " TARGET_MEASUREMENT_RESULTS,\n", 245 | ")\n", 246 | "\n", 247 | "# will define the number of vps and targets to use\n", 248 | "NB_TARGETS = 2\n", 249 | "NB_VPS = 4" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 8, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stderr", 259 | "output_type": "stream", 260 | "text": [ 261 | "2024-10-04 13:13:14::INFO:root:3539837011:: {'experiment_uuid': 'c78efe35-8089-41a9-9206-ac7bac4a8a68', 'status': 'ongoing', 'start_time': '2024-10-04 13:13:06.112516', 'end_time': None, 'is_dry_run': False, 'nb_targets': 2, 'nb_vps': 4, 'description': 'measurements from a set of vps towards all targets/target prefixes', 'af': 4, 'target_measurements': {'measurement_uuid': '6796bfe3-7137-43f1-9f9f-71e0a141157d', 'targets': ['103.196.37.98', '195.246.236.1', '77.220.233.1', '185.230.79.16', '185.34.2.114', '217.25.179.62'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047594.2916105, 'start_time': 1728047591.8001034}, 'prefix_measurements': {'measurement_uuid': 'dd2e9428-762d-4353-99ca-613057d430a3', 'targets': ['103.196.37.0', '195.246.236.0'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047589.574289, 'start_time': 1728047586.1349247}, 'meshed_measurements': {'measurement_uuid': '805d6778-9e09-4be7-9c43-d4aafc813a10', 'targets': ['103.196.37.98', '195.246.236.1', '77.220.233.1', '185.230.79.16', '185.34.2.114', '217.25.179.62'], 'vps': {'77.220.233.1': {'id': 6824, 'address_v4': '77.220.233.1', 'asn_v4': 42699, 'country_code': 'DE', 'geometry': {'type': 'Point', 'coordinates': [13.7285, 51.0395]}}, '185.230.79.16': {'id': 7122, 'address_v4': '185.230.79.16', 'asn_v4': 204515, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.1585, 48.7085]}}, '185.34.2.114': {'id': 
6798, 'address_v4': '185.34.2.114', 'asn_v4': 36236, 'country_code': 'AE', 'geometry': {'type': 'Point', 'coordinates': [55.8115, 25.6315]}}, '217.25.179.62': {'id': 7042, 'address_v4': '217.25.179.62', 'asn_v4': 24776, 'country_code': 'FR', 'geometry': {'type': 'Point', 'coordinates': [2.3695, 48.9085]}}}, 'end_time': 1728047591.7847333, 'start_time': 1728047589.5959833}}\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "measurement_config = load_json(config_file_path)\n", 267 | "logger.info(measurement_config)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 9, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "2024-10-04 13:13:14::INFO:root:1680719454:: retrieving results for measurement ids: dd2e9428-762d-4353-99ca-613057d430a3\n", 280 | "2024-10-04 13:13:15::INFO:root:measurement_utils:: nb measurements retrieved: 0 for measurement_uuid : dd2e9428-762d-4353-99ca-613057d430a3\n" 281 | ] 282 | }, 283 | { 284 | "ename": "UnboundLocalError", 285 | "evalue": "local variable 'result' referenced before assignment", 286 | "output_type": "error", 287 | "traceback": [ 288 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 289 | "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", 290 | "Cell \u001b[0;32mIn[9], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# sometimes, not all probes give output, reduce timeout if you do not want to wait for too long\u001b[39;00m\n\u001b[1;32m 6\u001b[0m response \u001b[38;5;241m=\u001b[39m retrieve_results(prefix_measurement_uuid, PREFIX_MEASUREMENT_RESULTS)\n\u001b[0;32m----> 8\u001b[0m \u001b[43minsert_prefix_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n", 291 | "File \u001b[0;32m/storage/hugo/geoloc-imc-2023/scripts/utils/measurement_utils.py:324\u001b[0m, in \u001b[0;36minsert_prefix_results\u001b[0;34m(results)\u001b[0m\n\u001b[1;32m 319\u001b[0m values_description \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msrc, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m )\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m results:\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mno data to insert, data = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mresult\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 326\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[1;32m 327\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 328\u001b[0m \u001b[38;5;66;03m# parse response\u001b[39;00m\n", 292 | "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'result' referenced before assignment" 293 | ] 294 | } 295 | ], 296 | "source": [ 297 | "prefix_measurement_uuid = measurement_config[\"prefix_measurements\"][\"measurement_uuid\"]\n", 298 | "\n", 299 | "logger.info(f\"retrieving results for measurement ids: {prefix_measurement_uuid}\")\n", 300 | "\n", 301 | "# sometimes, not all probes give output, reduce timeout if you do not want to wait for too long\n", 302 | "response = retrieve_results(prefix_measurement_uuid, 
PREFIX_MEASUREMENT_RESULTS)\n", 303 | "\n", 304 | "insert_prefix_results(response)" 305 | ] 306 | }, 307 | { 308 | "attachments": {}, 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Retrieve target results\n", 313 | "WARNING : Time consuming section\n", 314 | "\n", 315 | "Note: it might take some time before measurement results are available through the RIPE API. If no results are available, retry after a few minutes (or hours, it really depends on the probe itself)." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stderr", 325 | "output_type": "stream", 326 | "text": [ 327 | "2024-10-03 18:08:53::INFO:root:3802694766:: retrieving results for measurement ids: 18020ef4-fcc5-410b-9eb1-9ab3a18dd3a3\n", 328 | "2024-10-03 18:08:53::INFO:root:measurement_utils:: nb measurements retrieved: 20 for measurement_uuid : 18020ef4-fcc5-410b-9eb1-9ab3a18dd3a3\n", 329 | "2024-10-03 18:08:53::INFO:root:measurement_utils:: Target measurements successfully inserted in table : user_vps_to_target\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "target_measurement_uuid = measurement_config[\"target_measurements\"][\"measurement_uuid\"]\n", 335 | "\n", 336 | "logger.info(f\"retrieving results for measurement ids: {target_measurement_uuid}\")\n", 337 | "\n", 338 | "response = retrieve_results(target_measurement_uuid, TARGET_MEASUREMENT_RESULTS)\n", 339 | "\n", 340 | "insert_target_results(response)" 341 | ] 342 | } 343 | ], 344 | "metadata": { 345 | "kernelspec": { 346 | "display_name": "geoloc-imc-2023-GZT64Hva-py3.10", 347 | "language": "python", 348 | "name": "python3" 349 | }, 350 | "language_info": { 351 | "codemirror_mode": { 352 | "name": "ipython", 353 | "version": 3 354 | }, 355 | "file_extension": ".py", 356 | "mimetype": "text/x-python", 357 | "name": "python", 358 | "nbconvert_exporter": "python", 359 | "pygments_lexer": "ipython3", 360 | "version": "3.10.12" 361 | }, 362 | "orig_nbformat": 4 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 2 366 | } 367 | -------------------------------------------------------------------------------- /measurements/million_scale_measurements.py: -------------------------------------------------------------------------------- 1 | """perform a meshed ping measurement where each VP is probed by every other VP""" 2 | 3 | from logger import logger 4 | 5 | from scripts.utils.file_utils import load_json 6 | from scripts.utils.measurement_utils import ( 7 | load_targets, 8 | load_vps, 9 | get_measurement_config, 10 | save_measurement_config, 11 | get_target_prefixes, 12 | ping_prefixes, 13 | ping_targets, 14 | retrieve_results, 15 | insert_prefix_results, 16 | insert_target_results, 17 | ) 18 | from default import ( 19 | USER_ANCHORS_FILE, 20 | USER_HITLIST_FILE, 21 | PREFIX_MEASUREMENT_RESULTS, 22 | TARGET_MEASUREMENT_RESULTS, 23 | MEASUREMENT_CONFIG_PATH, 24 | ) 25 | 26 | # Small number of targets and VPs for testing 27 | # Change to the real anchors and VPs values for a complete measurement 28 | NB_TARGETS = 5 29 | NB_VPS = 10 30 | 31 | # measurement configuration for retrieval, 32 | # replace if you want to create a new batch of measurements 33 | EXPERIMENT_UUID = "3992e46c-73cf-4a7b-9428-3198856039a9" 34 | TARGET_MEASUREMENT_UUID = "03eb9559-88fe-41cb-b62c-4c07d1d5acb8" 35 | PREFIX_MEASUREMENT_UUID = "a09709aa-be76-4687-852e-64e8090bee70" 36 | CONFIG_PATH = MEASUREMENT_CONFIG_PATH / f"{EXPERIMENT_UUID}.json" 37 | 38 | 39 | def 
main_measurements() -> None: 40 | """perform all measurements related to million scale""" 41 | # set any of these variables to execute the corresponding function 42 | do_target_pings = True 43 | do_target_prefix_pings = True 44 | 45 | # load targets and VPs 46 | targets = load_targets(USER_ANCHORS_FILE, nb_target=NB_TARGETS) 47 | vps = load_vps(USER_ANCHORS_FILE, nb_vps=NB_VPS) 48 | 49 | # every anchor's /24 subnet 50 | target_addrs = [t["address_v4"] for t in targets] 51 | target_prefixes = get_target_prefixes(target_addrs) 52 | # responsive IP addresses in all /24 prefixes 53 | targets_per_prefix = load_json(USER_HITLIST_FILE) 54 | 55 | logger.info(f"Starting experiment with uuid :: {EXPERIMENT_UUID}") 56 | logger.info(f"Config output :: {CONFIG_PATH}") 57 | 58 | # check if measurements under this config uuid already exist 59 | if CONFIG_PATH.exists(): 60 | logger.info(f"Loading existing measurement config:: {EXPERIMENT_UUID}") 61 | measurement_config = load_json(CONFIG_PATH) 62 | else: 63 | # create a new config if there is no existing one 64 | measurement_config = get_measurement_config( 65 | targets=targets, 66 | vps=vps, 67 | target_prefixes=target_prefixes, 68 | experiment_uuid=EXPERIMENT_UUID, 69 | target_measurement_uuid=TARGET_MEASUREMENT_UUID, 70 | prefix_measurement_uuid=PREFIX_MEASUREMENT_UUID, 71 | ) 72 | save_measurement_config(measurement_config, CONFIG_PATH) 73 | 74 | if do_target_pings: 75 | vps.extend(targets) 76 | 77 | logger.info(f"Starting target pings :: {TARGET_MEASUREMENT_UUID}") 78 | logger.info(f"Nb targets :: {len(targets)}") 79 | logger.info(f"Nb vps :: {len(vps)}") 80 | 81 | # measurement configuration for retrieval 82 | ping_targets( 83 | measurement_uuid=TARGET_MEASUREMENT_UUID, 84 | measurement_config=measurement_config, 85 | targets=targets, 86 | vps=vps, 87 | use_cache=True, 88 | ) 89 | 90 | # update config 91 | save_measurement_config(measurement_config, CONFIG_PATH) 92 | 93 | if do_target_prefix_pings: 94 | logger.info(f"Starting prefix pings :: {PREFIX_MEASUREMENT_UUID}") 95 | logger.info(f"Nb targets :: {len(targets)}") 96 | logger.info(f"Nb prefixes :: {len(target_prefixes)}") 97 | logger.info(f"Nb vps :: {len(vps)}") 98 | 99 | ping_prefixes( 100 | vps=vps, 101 | target_prefixes=target_prefixes, 102 | targets_per_prefix=targets_per_prefix, 103 | measurement_uuid=PREFIX_MEASUREMENT_UUID, 104 | measurement_config=measurement_config, 105 | ) 106 | 107 | 108 | def main_retrieve_results() -> None: 109 | """retrieve all measurement results related to million scale""" 110 | retrieve_target_measurements = True 111 | retrieve_prefix_measurements = True 112 | 113 | measurement_config = load_json(CONFIG_PATH) 114 | logger.info(f"{measurement_config}") 115 | 116 | if retrieve_target_measurements: 117 | target_measurement_uuid = measurement_config["target_measurements"][ 118 | "measurement_uuid" 119 | ] 120 | 121 | logger.info( 122 | f"retrieving results for measurement ids: {target_measurement_uuid}" 123 | ) 124 | 125 | # sometimes, not all probes give output, reduce timeout if you do not want to wait for too long 126 | response = retrieve_results(target_measurement_uuid, TARGET_MEASUREMENT_RESULTS) 127 | 128 | # will output into user tables 129 | insert_target_results(response) 130 | 131 | if retrieve_prefix_measurements: 132 | prefix_measurement_uuid = measurement_config["prefix_measurements"][ 133 | "measurement_uuid" 134 | ] 135 | 136 | logger.info( 137 | f"retrieving results for measurement ids: {prefix_measurement_uuid}" 138 | ) 139 | 140 | # sometimes, not 
all probes give output, reduce timeout if you do not want to wait for too long 141 | response = retrieve_results(prefix_measurement_uuid, PREFIX_MEASUREMENT_RESULTS) 142 | 143 | # will output into user tables 144 | insert_prefix_results(response) 145 | 146 | 147 | if __name__ == "__main__": 148 | do_measurements = True 149 | do_retrieve_results = True 150 | 151 | if do_measurements: 152 | main_measurements() 153 | 154 | if do_retrieve_results: 155 | main_retrieve_results() 156 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "geoscale" 3 | version = "1.0" 4 | description = "Geolocation reproduction paper" 5 | authors = ["Danaelmilo "] 6 | readme = "README.md" 7 | packages = [ 8 | { include = "scripts" }, 9 | ] 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.9" 13 | numpy = "^1.21.3" 14 | matplotlib = "^3.4.3" 15 | requests = "^2.17.0" 16 | clickhouse-driver = "^0.2.6" 17 | overpy = "^0.6" 18 | dnspython = "^2.4.1" 19 | geopy = "^2.3.0" 20 | ujson = "^5.8.0" 21 | scipy = "^1.5.0" 22 | geopandas = "^0.13.2" 23 | rasterio = "^1.3.8" 24 | ipykernel = "^6.24.0" 25 | jupyter = "^1.0.0" 26 | py-radix = "^0.10.0" 27 | pyasn = "^1.6.2" 28 | clickhouse_driver = "^0.2.2" 29 | python-dotenv="^0.20.0" 30 | 31 | [tool.poetry.dev-dependencies] 32 | ipykernel = "^6.25.1" 33 | 34 | [build-system] 35 | requires = ["poetry-core"] 36 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /scripts/ripe_atlas/atlas_api.py: -------------------------------------------------------------------------------- 1 | # All functions to query RIPE Atlas API 2 | 3 | import json 4 | import time 5 | import requests 6 | import ipaddress 7 | 8 | from collections import defaultdict, OrderedDict 9 | from ipaddress import IPv4Network 10 | from random import randint 11 | 12 | from logger import logger 13 | 14 | 15 | class RIPEAtlas(object): 16 | def __init__( 17 | self, 18 | account: str, 19 | key: str, 20 | ) -> None: 21 | self.account = account 22 | self.key = key 23 | 24 | def ping( 25 | self, target, vps, tag: str, nb_packets: int = 3, max_retry: int = 60 26 | ) -> None: 27 | """start ping measurement towards target from vps, return Atlas measurement id""" 28 | 29 | for _ in range(max_retry): 30 | response = requests.post( 31 | f"https://atlas.ripe.net/api/v2/measurements/?key={self.key}", 32 | json={ 33 | "definitions": [ 34 | { 35 | "target": target, 36 | "af": 4, 37 | "packets": nb_packets, 38 | "size": 48, 39 | "tags": [tag], 40 | "description": f"Dioptra Geolocation of {target}", 41 | "resolve_on_probe": False, 42 | "skip_dns_check": True, 43 | "include_probe_id": False, 44 | "type": "ping", 45 | } 46 | ], 47 | "probes": [ 48 | {"value": vp, "type": "probes", "requested": 1} for vp in vps 49 | ], 50 | "is_oneoff": True, 51 | "bill_to": self.account, 52 | }, 53 | ).json() 54 | 55 | try: 56 | measurement_id = response["measurements"][0] 57 | break 58 | except KeyError: 59 | logger.info(response) 60 | logger.warning("Too many measurements. Waiting.") 61 | time.sleep(60) 62 | else: 63 | raise Exception("Too many measurements. 
Stopping.") 64 | 65 | if not response: 66 | return 67 | 68 | try: 69 | return measurement_id 70 | except (IndexError, KeyError): 71 | return 72 | 73 | def traceroute_measurement(self, target, probes_selector, options): 74 | ripe_key, description, tags, is_public, packets, protocol = options 75 | 76 | core_parameters = { 77 | "target": target, 78 | "af": 4, 79 | "description": description, 80 | "resolve_on_probe": False, 81 | "type": "traceroute", 82 | "tags": tags, 83 | "is_public": is_public, 84 | } 85 | 86 | traceroute_parameters = { 87 | "packets": packets, 88 | "protocol": protocol, 89 | } 90 | 91 | parameters = {} 92 | parameters.update(core_parameters) 93 | parameters.update(traceroute_parameters) 94 | 95 | definitions = [parameters] 96 | 97 | response = requests.post( 98 | f"https://atlas.ripe.net/api/v2/measurements/?key={ripe_key}", 99 | json={ 100 | "definitions": definitions, 101 | "probes": [probes_selector], 102 | "is_oneoff": True, 103 | "bill_to": self.account, 104 | }, 105 | ).json() 106 | return response 107 | 108 | def __str__(self): 109 | return "RIPE Atlas" 110 | 111 | 112 | def ripe_traceroute_to_csv(traceroute): 113 | protocols = {"ICMP": 1, "TCP": 6, "UDP": 17} 114 | rows = [] 115 | try: 116 | src_addr = traceroute["from"] 117 | dst_addr = traceroute["dst_addr"] 118 | af = traceroute["af"] 119 | if af == 4: 120 | dst_prefix = ".".join(dst_addr.split(".")[:3] + ["0"]) 121 | elif af == 6: 122 | dst_prefix = str( 123 | ipaddress.ip_network(dst_addr + "/48", strict=False).network_address 124 | ) 125 | except (KeyError, ValueError): 126 | return rows 127 | 128 | for hop in traceroute["result"]: 129 | for response in hop.get("result", []): 130 | if not response or response.get("error"): 131 | continue 132 | if response.get("x") == "*" or not response.get("rtt"): 133 | response["from"] = "*" 134 | response["rtt"] = 0 135 | response["ttl"] = 0 136 | proto = protocols[traceroute["proto"]] 137 | try: 138 | row = ( 139 | src_addr, 140 | dst_prefix, 141 | dst_addr, 142 | response["from"], 143 | proto, 144 | hop["hop"], 145 | response["rtt"], 146 | response["ttl"], 147 | traceroute["prb_id"], 148 | traceroute["msm_id"], 149 | traceroute["timestamp"], 150 | ) 151 | row_str = "".join(f",{x}" for x in row)[1:] 152 | rows.append(row_str) 153 | except Exception: 154 | print("ERROR", response) 155 | 156 | return rows 157 | 158 | 159 | def fetch_traceroutes_from_measurement_ids_no_csv( 160 | measurement_ids, start=None, stop=None 161 | ): 162 | res = [] 163 | for measurement_id in measurement_ids: 164 | result_url = ( 165 | f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/?" 
166 | ) 167 | if start: 168 | result_url += f"start={start}" 169 | if stop: 170 | result_url += f"&stop={stop}" 171 | traceroutes = requests.get(result_url).json() 172 | if "error" in traceroutes: 173 | print(traceroutes) 174 | continue 175 | for traceroute in traceroutes: 176 | rows = ripe_traceroute_to_csv(traceroute) 177 | for row in rows: 178 | res.append(row) 179 | return res 180 | 181 | 182 | def wait_for(measurement_id: str, max_retry: int = 30) -> None: 183 | for _ in range(max_retry): 184 | response = requests.get( 185 | f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/" 186 | ).json() 187 | 188 | # check if measurement is ongoing or not 189 | if response["status"]["name"] != "Ongoing": 190 | return response 191 | 192 | time.sleep(10) 193 | 194 | return None 195 | 196 | 197 | def get_prefix_from_ip(addr): 198 | """from an ip addr return /24 prefix""" 199 | prefix = addr.split(".")[:-1] 200 | prefix.append("0") 201 | prefix = ".".join(prefix) 202 | return prefix 203 | 204 | 205 | def get_target_hitlist(target_prefix, nb_targets, targets_per_prefix): 206 | """from ip, return a list of target ips""" 207 | target_addr_list = [] 208 | try: 209 | target_addr_list = targets_per_prefix[target_prefix] 210 | except KeyError: 211 | pass 212 | 213 | target_addr_list = list(set(target_addr_list)) 214 | 215 | if len(target_addr_list) < nb_targets: 216 | prefix = IPv4Network(target_prefix + "/24") 217 | target_addr_list.extend( 218 | [ 219 | str(prefix[randint(1, 254)]) 220 | for _ in range(0, nb_targets - len(target_addr_list)) 221 | ] 222 | ) 223 | 224 | if len(target_addr_list) > nb_targets: 225 | target_addr_list = target_addr_list[:nb_targets] 226 | 227 | return target_addr_list 228 | 229 | 230 | def is_geoloc_disputed(probe: dict) -> bool: 231 | """check if geoloc disputed flag is contained in probe metadata""" 232 | 233 | tags = probe["tags"] 234 | for tag in tags: 235 | if tag["slug"] == "system-geoloc-disputed": 236 | return True 237 | return False 238 | 239 | 240 | def get_measurement_url(measurement_id: int) -> str: 241 | """return Atlas API url for get measurement request""" 242 | 243 | return f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/" 244 | 245 | 246 | def get_response(url: str, max_retry: int = 60, wait_time: int = 2) -> list: 247 | """request to Atlas API""" 248 | 249 | for _ in range(max_retry): 250 | response = requests.get(url) 251 | 252 | # small parsing, as response might not be Json formatted 253 | try: 254 | response = json.loads(response.content) 255 | except json.JSONDecodeError: 256 | response = response.content.decode() 257 | response = response.replace("}{", "}, {") 258 | response = response.replace("} {", "}, {") 259 | response = json.loads(response) 260 | 261 | if response != []: 262 | break 263 | time.sleep(wait_time) 264 | 265 | return response 266 | 267 | 268 | def parse_measurements_results(response: list) -> dict: 269 | """from get Atlas measurement request return parsed results""" 270 | 271 | # parse response 272 | measurement_results = defaultdict(dict) 273 | for result in response: 274 | # parse results and calculate geoloc 275 | if result.get("result") is not None: 276 | dst_addr = result["dst_addr"] 277 | vp_addr = result["from"] 278 | 279 | if type(result["result"]) == list: 280 | rtt_list = [list(rtt.values())[0] for rtt in result["result"]] 281 | else: 282 | rtt_list = [result["result"]["rtt"]] 283 | 284 | # remove stars from results 285 | rtt_list = list(filter(lambda x: x != "*", rtt_list)) 286 | if not 
rtt_list: 287 | continue 288 | 289 | # sometimes connection error with vantage point cause result to be string message 290 | try: 291 | min_rtt = min(rtt_list) 292 | except TypeError: 293 | continue 294 | 295 | if isinstance(min_rtt, str): 296 | continue 297 | 298 | measurement_results[dst_addr][vp_addr] = { 299 | "node": vp_addr, 300 | "min_rtt": min_rtt, 301 | "rtt_list": rtt_list, 302 | } 303 | 304 | else: 305 | logger.warning(f"no results: {result}") 306 | 307 | # order vps per increasing rtt 308 | for dst_addr in measurement_results: 309 | measurement_results[dst_addr] = OrderedDict( 310 | { 311 | vp: results 312 | for vp, results in sorted( 313 | measurement_results[dst_addr].items(), 314 | key=lambda item: item[1]["min_rtt"], 315 | ) 316 | } 317 | ) 318 | 319 | return measurement_results 320 | 321 | 322 | def get_measurement_from_id( 323 | measurement_id: int, 324 | max_retry: int = 60, 325 | wait_time: int = 10, 326 | ) -> dict: 327 | """retrieve measurement results from RIPE Atlas with measurement id""" 328 | 329 | url = get_measurement_url(measurement_id) 330 | 331 | response = get_response(url, max_retry=max_retry, wait_time=wait_time) 332 | 333 | return response 334 | 335 | 336 | def get_measurements_from_tag(tag: str) -> dict: 337 | """retrieve all measurements that share the same tag and return parsed measurement results""" 338 | 339 | url = f"https://atlas.ripe.net/api/v2/measurements/tags/{tag}/results/" 340 | 341 | response = get_response(url, max_retry=1, wait_time=1) 342 | 343 | return response 344 | 345 | 346 | def get_from_atlas(url: str): 347 | """get request url atlas endpoint""" 348 | response = requests.get(url).json() 349 | while True: 350 | for anchor in response["results"]: 351 | yield anchor 352 | 353 | if response["next"]: 354 | response = requests.get(response["next"]).json() 355 | else: 356 | break 357 | 358 | 359 | def get_atlas_probes() -> list: 360 | """return all connected atlas probes""" 361 | probes = [] 362 | rejected = 0 363 | geoloc_disputed = 0 364 | for _, probe in enumerate(get_from_atlas("https://atlas.ripe.net/api/v2/probes/")): 365 | # filter probes based on generic criteria 366 | if not probe["is_anchor"]: 367 | if ( 368 | probe["status"]["name"] != "Connected" 369 | or probe.get("geometry") is None 370 | or probe.get("address_v4") is None 371 | or probe.get("country_code") is None 372 | ): 373 | rejected += 1 374 | continue 375 | 376 | if is_geoloc_disputed(probe): 377 | geoloc_disputed += 1 378 | continue 379 | 380 | reduced_probe = { 381 | "id": probe["id"], 382 | "address_v4": probe["address_v4"], 383 | "asn_v4": probe["asn_v4"], 384 | "country_code": probe["country_code"], 385 | "geometry": probe["geometry"], 386 | } 387 | probes.append(reduced_probe) 388 | 389 | return probes, rejected, geoloc_disputed 390 | 391 | 392 | def get_atlas_anchors() -> list: 393 | """return all atlas anchors""" 394 | anchors = [] 395 | rejected = 0 396 | geoloc_disputed = 0 397 | for _, anchor in enumerate(get_from_atlas("https://atlas.ripe.net/api/v2/probes/")): 398 | # filter anchors based on generic criteria 399 | if anchor["is_anchor"]: 400 | if ( 401 | anchor["status"]["name"] != "Connected" 402 | or anchor.get("geometry") is None 403 | or anchor.get("address_v4") is None 404 | or anchor.get("country_code") is None 405 | ): 406 | rejected += 1 407 | continue 408 | 409 | if is_geoloc_disputed(anchor): 410 | geoloc_disputed += 1 411 | continue 412 | 413 | reduced_anchor = { 414 | "id": anchor["id"], 415 | "address_v4": anchor["address_v4"], 416 | "asn_v4": 
anchor["asn_v4"], 417 | "country_code": anchor["country_code"], 418 | "geometry": anchor["geometry"], 419 | "id": anchor["id"], 420 | } 421 | anchors.append(reduced_anchor) 422 | 423 | return anchors, rejected, geoloc_disputed 424 | -------------------------------------------------------------------------------- /scripts/ripe_atlas/ping_and_traceroute_classes.py: -------------------------------------------------------------------------------- 1 | # Two classes to instantiate before calling RIPE Atlas API: one for ping measurements and one for traceroute measurements 2 | 3 | import time 4 | 5 | from pprint import pprint 6 | from copy import copy 7 | 8 | from logger import logger 9 | from scripts.ripe_atlas.atlas_api import RIPEAtlas, wait_for, get_target_hitlist 10 | from scripts.utils.credentials import get_ripe_atlas_credentials 11 | 12 | 13 | MAX_NUMBER_OF_VPS = 1_000 14 | NB_MAX_CONCURRENT_MEASUREMENTS = 90 15 | NB_PACKETS = 3 16 | NB_TARGETS_PER_PREFIX = 3 17 | 18 | 19 | class PING: 20 | def __init__( 21 | self, 22 | ) -> None: 23 | ripe_credentials = get_ripe_atlas_credentials() 24 | 25 | self.account = ripe_credentials["username"] 26 | self.key = ripe_credentials["secret_key"] 27 | 28 | self.driver = RIPEAtlas(self.account, self.key) 29 | 30 | def ping_by_prefix( 31 | self, 32 | target_prefixes: list, 33 | vps: dict, 34 | targets_per_prefix: dict, 35 | tag: str, 36 | nb_packets: int = NB_PACKETS, 37 | nb_targets: int = NB_TARGETS_PER_PREFIX, 38 | dry_run: bool = False, 39 | ): 40 | """from a list of prefixes, start measurements for n target addrs in prefix""" 41 | 42 | active_measurements = [] 43 | all_measurement_ids = [] 44 | start_time = time.time() 45 | for i, target_prefix in enumerate(target_prefixes): 46 | 47 | logger.info( 48 | f"Ping for target prefix:: {target_prefix}, {i+1}/{len(target_prefixes)}" 49 | ) 50 | 51 | # get target_addr_list 52 | target_addr_list = get_target_hitlist( 53 | target_prefix, nb_targets, targets_per_prefix 54 | ) 55 | 56 | # get vps id for measurement, remove target if in vps 57 | 58 | logger.debug( 59 | f"starting measurement for {target_prefix} with {[addr for addr in target_addr_list]}" 60 | ) 61 | 62 | for target_addr in target_addr_list: 63 | vp_ids = [vp["id"] for vp in vps if vp["address_v4"] != target_addr] 64 | for i in range(0, len(vp_ids), MAX_NUMBER_OF_VPS): 65 | subset_vp_ids = vp_ids[i : i + MAX_NUMBER_OF_VPS] 66 | 67 | logger.debug( 68 | f"starting measurement for {target_addr} with {len(subset_vp_ids)} vps" 69 | ) 70 | 71 | if not dry_run: 72 | measurement_id = self.driver.ping( 73 | str(target_addr), subset_vp_ids, str(tag), nb_packets 74 | ) 75 | 76 | logger.info( 77 | f"measurement tag: {tag} : started measurement id : {measurement_id}" 78 | ) 79 | else: 80 | measurement_id = 404 81 | 82 | active_measurements.append(measurement_id) 83 | all_measurement_ids.append(measurement_id) 84 | 85 | # check number of parallel measurements in not too high 86 | if len(active_measurements) >= NB_MAX_CONCURRENT_MEASUREMENTS: 87 | logger.info( 88 | f"Reached limit for number of concurrent measurements: {len(active_measurements)}" 89 | ) 90 | tmp_measurement_ids = copy(active_measurements) 91 | for id in tmp_measurement_ids: 92 | # wait for the last measurement of the batch to end before starting a new one 93 | if not dry_run: 94 | measurement_result = wait_for(id) 95 | if measurement_result: 96 | active_measurements.remove(id) 97 | else: 98 | active_measurements.remove(id) 99 | time.sleep(0.5) 100 | 101 | logger.info(f"measurement : {tag} done") 
102 | 103 | end_time = time.time() 104 | 105 | return all_measurement_ids, start_time, end_time 106 | 107 | def ping_by_target( 108 | self, 109 | targets: list[dict], 110 | vps: list[dict], 111 | tag: str, 112 | nb_packets: int = NB_PACKETS, 113 | dry_run: bool = False, 114 | ): 115 | """from a list of prefixes, start measurements for n target addrs in prefix""" 116 | 117 | active_measurements = [] 118 | all_measurement_ids = [] 119 | start_time = time.time() 120 | for i, target_addr in enumerate(targets): 121 | logger.info(f"Ping for target:: {target_addr}, {i+1}/{len(targets)}") 122 | 123 | # get vps id for measurement, remove target if in vps 124 | vp_ids = [vp["id"] for vp in vps if vp["address_v4"] != target_addr] 125 | 126 | for i in range(0, len(vp_ids), MAX_NUMBER_OF_VPS): 127 | subset_vp_ids = vp_ids[i : i + MAX_NUMBER_OF_VPS] 128 | 129 | logger.debug( 130 | f"starting measurement for {target_addr} with {len(subset_vp_ids)} vps" 131 | ) 132 | 133 | if not dry_run: 134 | measurement_id = self.driver.ping( 135 | str(target_addr), subset_vp_ids, str(tag), nb_packets 136 | ) 137 | else: 138 | measurement_id = 404 139 | 140 | active_measurements.append(measurement_id) 141 | all_measurement_ids.append(measurement_id) 142 | 143 | logger.info( 144 | f"measurement tag: {tag} : started measurement id : {measurement_id}" 145 | ) 146 | 147 | # check number of parallel measurements in not too high 148 | if len(active_measurements) >= NB_MAX_CONCURRENT_MEASUREMENTS: 149 | logger.info( 150 | f"Reached limit for number of concurrent measurements: {len(active_measurements)}" 151 | ) 152 | tmp_measurement_ids = copy(active_measurements) 153 | for id in tmp_measurement_ids: 154 | # wait for the last measurement of the batch to end before starting a new one 155 | if not dry_run: 156 | measurement_result = wait_for(id) 157 | if measurement_result: 158 | active_measurements.remove(id) 159 | else: 160 | active_measurements.remove(id) 161 | time.sleep(0.5) 162 | 163 | logger.info(f"measurement : {tag} done") 164 | 165 | end_time = time.time() 166 | 167 | return all_measurement_ids, start_time, end_time 168 | 169 | 170 | class TRACEROUTE: 171 | def __init__( 172 | self, 173 | ) -> None: 174 | ripe_credentials = get_ripe_atlas_credentials() 175 | 176 | self.account = ripe_credentials["username"] 177 | self.key = ripe_credentials["secret_key"] 178 | self.driver = RIPEAtlas(self.account, self.key) 179 | 180 | def traceroute(self, target, probe_id): 181 | description = "Geoloc project" 182 | tags = ["traceroute", "test", "geoloc"] 183 | is_public = True 184 | probes = {"value": str(probe_id), "type": "probes", "requested": 1} 185 | packets = 3 186 | protocol = "ICMP" 187 | options = (self.key, description, tags, is_public, packets, protocol) 188 | 189 | response = self.driver.traceroute_measurement(target, probes, options) 190 | 191 | if "measurements" in response and len(response["measurements"]) == 1: 192 | return response["measurements"][0] 193 | else: 194 | print(f"Failed to traceroute") 195 | pprint(response) 196 | return None 197 | -------------------------------------------------------------------------------- /scripts/street_level/landmark.py: -------------------------------------------------------------------------------- 1 | # Do the landmark selection step as explained in the street level paper 2 | 3 | import requests 4 | import overpy 5 | import dns 6 | import dns.resolver 7 | import urllib3 8 | import pyasn 9 | import warnings 10 | 11 | from multiprocessing import Pool 12 | from bs4 import 
BeautifulSoup, MarkupResemblesLocatorWarning 13 | from geopy import Point, distance 14 | 15 | from scripts.utils.file_utils import load_json, dump_json 16 | from default import CACHED_WEBSITES_FILE, IP_TO_ASN_FILE 17 | 18 | 19 | warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning) 20 | urllib3.disable_warnings() 21 | 22 | 23 | def get_bounding_box(lat, lon): 24 | p = Point(lat, lon) 25 | d = distance.distance(kilometers=2).meters 26 | top_right = distance.distance(meters=d).destination(p, 45) 27 | bottom_left = distance.distance(meters=d).destination(p, 225) 28 | return (bottom_left.latitude, bottom_left.longitude, top_right.latitude, top_right.longitude) 29 | 30 | 31 | def check_domain_name_ip(domain_name, ip_address, protocol): 32 | # print(f"Checking {domain_name}") 33 | ip_url = protocol + "://" + ip_address 34 | domain_url = protocol + "://" + domain_name 35 | try: 36 | ip_response = requests.get(ip_url, verify=False, timeout=1) 37 | if ip_response.status_code != 200: 38 | return False 39 | domain_response = requests.get(domain_url, timeout=1) 40 | if domain_response.status_code != 200: 41 | return False 42 | except Exception: 43 | # print(traceback.format_exc()) 44 | return False 45 | 46 | try: 47 | ip_soup = BeautifulSoup(ip_response.content, "html.parser") 48 | domain_soup = BeautifulSoup(domain_response.content, "html.parser") 49 | ip_title = ip_soup.head.title.text 50 | domain_title = domain_soup.head.title.text 51 | if ip_title == domain_title: 52 | return True 53 | else: 54 | return False 55 | except: 56 | return False 57 | 58 | 59 | def check_and_get_website_ip(website, protocol): 60 | asns = ['20940', '16625', '12222', '16625', '21342', '21399', '32787', '35994', '35993', '35995', '36408', '393234', '394689', 61 | '13335', '202018', '202109', '133293', '395747', 62 | '54113', '209242', 63 | '16509', '14618', '16509', '39111', '16509', 64 | '8075', '8075', '8075', '12076', '12222', 65 | '15169', '36351', '22577', '36040', '55023', 66 | '22822', 67 | '701', '22394, 11608, 11608', 68 | '3356', '133229, 133229, 395570', 69 | '60068', '136620', '395354', 70 | '32934'] 71 | res = {} 72 | asndb = pyasn.pyasn(str(IP_TO_ASN_FILE)) 73 | try: 74 | result = dns.resolver.resolve(website) 75 | except Exception: 76 | # print(traceback.format_exc()) 77 | return {'dns-failed': True} 78 | if len(result) == 0: 79 | return {'dns-failed': True} 80 | res = {'dns-failed': False} 81 | 82 | ip = result[0].to_text() 83 | res['ip'] = ip 84 | asn = asndb.lookup(ip)[0] 85 | if asn == None: 86 | res['asn-found'] = False 87 | return res 88 | else: 89 | res['asn-found'] = True 90 | if str(asn) in asns or 'google' in website or 'facebook' in website or 'amazon' in website or 'microsoft' in website or 'azure' in website or 'akamai' in website or 'cdn' in website: 91 | res['cdn'] = True 92 | return res 93 | else: 94 | res['cdn'] = False 95 | 96 | if check_domain_name_ip(website, ip, protocol): 97 | res['header-test'] = True 98 | return res 99 | else: 100 | res['header-test'] = False 101 | return res 102 | 103 | 104 | def get_one_website_ip(domain, protocol, lat, lon): 105 | ip_info = check_and_get_website_ip(domain, protocol) 106 | ip_info['domain'] = domain 107 | ip_info['protocol'] = protocol 108 | ip_info['lat'] = lat 109 | ip_info['lon'] = lon 110 | return ip_info 111 | 112 | 113 | def get_landmarks_with_website_from_lat_lon(lat_arg, lon_arg): 114 | # api = overpy.Overpass() 115 | # api = overpy.Overpass(url="https://overpass.kumi.systems/api/interpreter") 116 | api = overpy.Overpass( 
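get_bounding_box above returns a (south, west, north, east) tuple whose corners lie roughly 2 km from the query point along the 45° and 225° bearings; the Overpass query below then collects every node and way tagged with a `website` inside that box. A quick, hedged check of the helper (the coordinates are arbitrary and the repository root is assumed to be on PYTHONPATH with the project's dependencies installed):

```python
from scripts.street_level.landmark import get_bounding_box

# ~2 km bounding box around central Paris (illustrative coordinates)
south, west, north, east = get_bounding_box(48.8566, 2.3522)
print(f"bbox = ({south:.4f}, {west:.4f}, {north:.4f}, {east:.4f})")
```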
117 | url="https://maps.mail.ru/osm/tools/overpass/api/interpreter") 118 | bbox = get_bounding_box(lat_arg, lon_arg) 119 | query = f""" 120 | [out:json]; 121 | ( 122 | node ({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}) 123 | [website]; 124 | way ({bbox[0]},{bbox[1]},{bbox[2]},{bbox[3]}) 125 | [website]; 126 | ); 127 | out; 128 | """ 129 | result = api.query(query) 130 | res = [] 131 | for node in result.nodes: 132 | lat = float(node.lat) 133 | lon = float(node.lon) 134 | tags = node.tags 135 | website = tags['website'] 136 | res.append((website, lat, lon)) 137 | for way in result.ways: 138 | try: 139 | tmp_lat = 0 140 | tmp_lon = 0 141 | nodes = way.get_nodes(resolve_missing=True) 142 | for node in nodes: 143 | tmp_lat += float(node.lat) 144 | tmp_lon += float(node.lon) 145 | lat = tmp_lat/len(nodes) 146 | lon = tmp_lon/len(nodes) 147 | tags = way.tags 148 | website = tags['website'] 149 | res.append((website, lat, lon)) 150 | except: 151 | continue 152 | return res 153 | 154 | 155 | def get_all_landmarks_and_stats_from_points(points): 156 | dict_website = {} 157 | with Pool(8) as pool: 158 | results = pool.starmap(get_landmarks_with_website_from_lat_lon, points) 159 | for result in results: 160 | if result != None and result != []: 161 | for elem in result: 162 | dict_website[elem[0]] = elem 163 | 164 | unique_website = {} 165 | for url in dict_website: 166 | if "://" in url: 167 | protocol = url.split("://")[0] 168 | domain_name = url.split("://")[1] 169 | else: 170 | protocol = "http" 171 | domain_name = url 172 | website = domain_name.split("/")[0] 173 | if (website, protocol) not in unique_website: 174 | unique_website[(website, protocol)] = dict_website[url] 175 | 176 | args = [] 177 | failed_dns_count = 0 178 | failed_asn_count = 0 179 | cdn_count = 0 180 | failed_header_test_count = 0 181 | landmarks = [] 182 | 183 | try: 184 | all_websites = load_json(CACHED_WEBSITES_FILE) 185 | except FileNotFoundError: 186 | all_websites = {} 187 | 188 | for k, v in unique_website.items(): 189 | # fix websites 190 | if 'google' in k or 'facebook' in k or 'amazon' in k or 'microsoft' in k or 'azure' in k or 'akamai' in k or 'cdn' in k: 191 | all_websites[k]['cdn'] = True 192 | 193 | if k[0] not in all_websites: 194 | args.append((k[0], k[1], v[1], v[2])) 195 | else: 196 | result = all_websites[k[0]] 197 | if 'dns-failed' not in result or result['dns-failed']: 198 | failed_dns_count += 1 199 | continue 200 | if 'asn-found' not in result or not result['asn-found']: 201 | failed_asn_count += 1 202 | continue 203 | if 'cdn' not in result or result['cdn']: 204 | cdn_count += 1 205 | continue 206 | if 'header-test' not in result or not result['header-test']: 207 | failed_header_test_count += 1 208 | continue 209 | landmarks.append( 210 | (result['ip'], result['domain'], result['lat'], result['lon'])) 211 | 212 | with Pool() as pool: 213 | results = pool.starmap(get_one_website_ip, args) 214 | for result in results: 215 | all_websites[result['domain']] = result 216 | if 'dns-failed' not in result or result['dns-failed']: 217 | failed_dns_count += 1 218 | continue 219 | if 'asn-found' not in result or not result['asn-found']: 220 | failed_asn_count += 1 221 | continue 222 | if 'cdn' not in result or result['cdn']: 223 | cdn_count += 1 224 | continue 225 | if 'header-test' not in result or not result['header-test']: 226 | failed_header_test_count += 1 227 | continue 228 | landmarks.append( 229 | (result['ip'], result['domain'], result['lat'], result['lon'])) 230 | 231 | dump_json(all_websites, 
CACHED_WEBSITES_FILE) 232 | 233 | return failed_dns_count, failed_asn_count, cdn_count, failed_header_test_count, landmarks 234 | -------------------------------------------------------------------------------- /scripts/street_level/three_tiers.py: -------------------------------------------------------------------------------- 1 | # One function per tier of the street level method. 2 | 3 | import time 4 | 5 | from scripts.analysis.analysis import local_circle_preprocessing 6 | from scripts.street_level.landmark import get_all_landmarks_and_stats_from_points 7 | from scripts.utils.helpers import get_center_of_poly, get_points_in_poly 8 | from scripts.street_level.traceroutes_results import ( 9 | get_circles_to_target, 10 | start_and_get_traceroutes, 11 | ) 12 | 13 | 14 | def tier_1(target_ip, res, vps=None): 15 | st = time.time() 16 | # Get all circles (from each VP to the target) 17 | all_circles = get_circles_to_target(target_ip, vps) 18 | 19 | # Try the recommended internet speed at first 20 | speed_threshold = 4 / 9 21 | imp_circles = local_circle_preprocessing( 22 | all_circles, speed_threshold=speed_threshold 23 | ) 24 | lat, lon = get_center_of_poly(imp_circles, speed_threshold) 25 | 26 | # If there is no intersection polygone try a slower interent speed 27 | if lat == None or lon == None: 28 | speed_threshold = 2 / 3 29 | imp_circles = local_circle_preprocessing( 30 | all_circles, speed_threshold=speed_threshold 31 | ) 32 | lat, lon = get_center_of_poly(imp_circles, speed_threshold) 33 | res["speed_threshold"] = speed_threshold 34 | res["tier1:lat"] = lat 35 | res["tier1:lon"] = lon 36 | res["vps"] = imp_circles 37 | et = time.time() 38 | # Saving the time needed to perform this step 39 | res["tier1:duration"] = et - st 40 | return res 41 | 42 | 43 | def tier_2(target_ip, res, vps=None): 44 | st = time.time() 45 | tier2_points = get_points_in_poly(res["vps"], 36, 5, res["speed_threshold"]) 46 | res["tier2:all_points_count"] = len(tier2_points) 47 | 48 | # We remove points further than 1000km from the estimated center of the polygone (in case the intersection area is too big) 49 | tier2_points = tier2_points[: 200 * 10 + 1] 50 | res["tier2:inspected_points_count"] = len(tier2_points) 51 | if len(tier2_points) == 0: 52 | res["tier2:lat"] = None 53 | res["tier2:lon"] = None 54 | et = time.time() 55 | res["tier2:duration"] = et - st 56 | return res 57 | 58 | ( 59 | failed_dns_count, 60 | failed_asn_count, 61 | cdn_count, 62 | failed_header_test_count, 63 | landmarks, 64 | ) = get_all_landmarks_and_stats_from_points(tier2_points) 65 | # We save stats for possiblity of a website to be used as a landmark 66 | res["tier2:failed_dns_count"] = failed_dns_count 67 | res["tier2:failed_asn_count"] = failed_asn_count 68 | res["tier2:cdn_count"] = cdn_count 69 | res["tier2:non_cdn_count"] = len(landmarks) + failed_header_test_count 70 | res["tier2:landmark_count"] = len(landmarks) 71 | res["tier2:failed_header_test_count"] = failed_header_test_count 72 | res["tier2:landmarks"] = landmarks 73 | 74 | if len(res["tier2:landmarks"]) == 0: 75 | res["tier2:lat"] = None 76 | res["tier2:lon"] = None 77 | et = time.time() 78 | res["tier2:duration"] = et - st 79 | return res 80 | 81 | res["tier2:traceroutes"] = start_and_get_traceroutes( 82 | target_ip, res["vps"], res["tier2:landmarks"], vps 83 | ) 84 | all_circles = [] 85 | best_rtt = 5000 86 | res_lat = None 87 | res_lon = None 88 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[ 89 | "tier2:traceroutes" 90 | ]: 91 | if 
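# rtt here is the d1 + d2 estimate returned by get_rtt_diff() (see
# scripts/street_level/traceroutes_results.py); lat/lon are the landmark's
# coordinates. A negative value (e.g. -1) means no usable common hop was
# found, so the landmark is skipped.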
rtt < 0: 92 | continue 93 | all_circles.append((lat, lon, rtt, None, None)) 94 | if rtt < best_rtt: 95 | best_rtt = rtt 96 | res_lat = lat 97 | res_lon = lon 98 | 99 | # If there is no valid RTT then tier 2 has failed and we can not go further 100 | if len(all_circles) == 0: 101 | res["tier2:lat"] = None 102 | res["tier2:lon"] = None 103 | et = time.time() 104 | res["tier2:duration"] = et - st 105 | return res 106 | 107 | # If not, we use the smallest rtt landmark as 108 | res["tier2:lat"] = res_lat 109 | res["tier2:lon"] = res_lon 110 | res["tier2:final_circles"] = all_circles 111 | et = time.time() 112 | res["tier2:duration"] = et - st 113 | return res 114 | 115 | 116 | def tier_3(target_ip, res, vps=None): 117 | st = time.time() 118 | if "tier2:final_circles" not in res: 119 | res["tier3:lat"] = None 120 | res["tier3:lon"] = None 121 | et = time.time() 122 | res["tier3:duration"] = et - st 123 | return res 124 | 125 | else: 126 | all_circles = res["tier2:final_circles"] 127 | 128 | imp_circles = local_circle_preprocessing( 129 | all_circles, speed_threshold=res["speed_threshold"] 130 | ) 131 | tier3_points = get_points_in_poly( 132 | imp_circles, 10, 1, res["speed_threshold"], res["vps"] 133 | ) 134 | res["tier3:all_points_count"] = len(tier3_points) 135 | 136 | # We remove points/zipcodes further then 40Km away from the center of the polygone 137 | tier3_points = tier3_points[: 40 * 36 + 1] 138 | res["tier3:inspected_points_count"] = len(tier3_points) 139 | if len(tier3_points) == 0: 140 | res["tier3:lat"] = None 141 | res["tier3:lon"] = None 142 | et = time.time() 143 | res["tier3:duration"] = et - st 144 | return res 145 | 146 | ( 147 | failed_dns_count, 148 | failed_asn_count, 149 | cdn_count, 150 | failed_header_test_count, 151 | tmp_landmarks, 152 | ) = get_all_landmarks_and_stats_from_points(tier3_points) 153 | landmarks = [] 154 | for landmark in tmp_landmarks: 155 | ip = landmark[0] 156 | found = False 157 | for t2_lm in res["tier2:landmarks"]: 158 | if t2_lm[0] == ip: 159 | found = True 160 | break 161 | if not found: 162 | landmarks.append(landmark) 163 | 164 | res["tier3:failed_dns_count"] = failed_dns_count 165 | res["tier3:failed_asn_count"] = failed_asn_count 166 | res["tier3:cdn_count"] = cdn_count 167 | res["tier3:non_cdn_count"] = len(landmarks) + failed_header_test_count 168 | res["tier3:landmark_count"] = len(landmarks) 169 | res["tier3:failed_header_test_count"] = failed_header_test_count 170 | res["tier3:landmarks"] = landmarks 171 | 172 | if len(res["tier3:landmarks"]) == 0: 173 | res["tier3:lat"] = None 174 | res["tier3:lon"] = None 175 | et = time.time() 176 | res["tier3:duration"] = et - st 177 | return res 178 | 179 | res["tier3:traceroutes"] = start_and_get_traceroutes( 180 | target_ip, res["vps"], res["tier3:landmarks"], vps 181 | ) 182 | 183 | best_lon = None 184 | best_lat = None 185 | best_rtt = 5000 186 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[ 187 | "tier2:traceroutes" 188 | ]: 189 | if rtt < 0: 190 | continue 191 | if rtt < best_rtt: 192 | best_rtt = rtt 193 | best_lon = lon 194 | best_lat = lat 195 | for probe_ip, target_ip, landmark_ip, r1ip, rtt, lat, lon, traceroute_id in res[ 196 | "tier3:traceroutes" 197 | ]: 198 | if rtt < 0: 199 | continue 200 | if rtt < best_rtt: 201 | best_rtt = rtt 202 | best_lon = lon 203 | best_lat = lat 204 | 205 | res["tier3:lat"] = best_lat 206 | res["tier3:lon"] = best_lon 207 | et = time.time() 208 | res["tier3:duration"] = et - st 209 | return res 210 | 211 | 212 | def 
get_all_info_geoloc(target_ip, vps=None): 213 | # Init results 214 | res = { 215 | "target_ip": target_ip, 216 | "tier1:done": False, 217 | "tier2:done": False, 218 | "tier3:done": False, 219 | "negative_rtt_included": True, 220 | } 221 | res = tier_1(target_ip, res, vps=vps) 222 | 223 | # Using tier 1(CBG) results as geolocation if the other steps fail 224 | res["lat"] = res["tier1:lat"] 225 | res["lon"] = res["tier1:lon"] 226 | if res["tier1:lat"] == None or res["tier1:lon"] == None: 227 | return res 228 | res["tier1:done"] = True 229 | 230 | res = tier_2(target_ip, res, vps=vps) 231 | 232 | # Using tier 2 resultsas geolocation if the last step fails 233 | if res["tier2:lat"] == None or res["tier2:lon"] == None: 234 | return res 235 | else: 236 | res["tier2:done"] = True 237 | res["lat"] = res["tier2:lat"] 238 | res["lon"] = res["tier2:lon"] 239 | 240 | res = tier_3(target_ip, res, vps=vps) 241 | 242 | if res["tier3:lat"] != None and res["tier3:lon"] != None: 243 | res["tier3:done"] = True 244 | res["lat"] = res["tier3:lat"] 245 | res["lon"] = res["tier3:lon"] 246 | 247 | return res 248 | 249 | 250 | def geoloc(target_ip): 251 | """ 252 | This function return a dict containint the lat, lon coordinates of the given target_ip. 253 | The target_ip should be traceroutable. 254 | The function gives a less informative gelocation result than get_all_info_geoloc 255 | """ 256 | all_info = get_all_info_geoloc(target_ip) 257 | return {"lat": all_info["lat"], "lon": all_info["lon"]} 258 | -------------------------------------------------------------------------------- /scripts/street_level/traceroutes_results.py: -------------------------------------------------------------------------------- 1 | """Intermediate functions during street level traceroutes process""" 2 | 3 | import time 4 | 5 | from scripts.utils.clickhouse import Clickhouse 6 | from scripts.utils.file_utils import load_json 7 | from scripts.ripe_atlas.ping_and_traceroute_classes import TRACEROUTE 8 | from scripts.ripe_atlas.atlas_api import fetch_traceroutes_from_measurement_ids_no_csv 9 | from default import USER_ANCHORS_FILE, STREET_LEVEL_TRACEROUTES_TABLE 10 | 11 | 12 | def start_traceroutes_to_targets(targets, probes): 13 | results_to_get = [] 14 | for target in targets: 15 | target_ip = target[0] 16 | for probe in probes: 17 | probe_ip = probe["address_v4"] 18 | probe_id = str(probe["id"]) 19 | trace = TRACEROUTE() 20 | res = trace.traceroute(target_ip, probe_id) 21 | if res != None: 22 | results_to_get.append((res, probe_ip, target_ip)) 23 | return results_to_get 24 | 25 | 26 | def get_traceroutes_results(traceroute_ids): 27 | next_to_do = [] 28 | for id in traceroute_ids: 29 | next_to_do.append(id) 30 | nb_tries = 20 31 | while nb_tries > 0 and len(next_to_do) > 0: 32 | nb_tries -= 1 33 | to_do = [] 34 | for id in next_to_do: 35 | to_do.append(id) 36 | 37 | next_to_do = [] 38 | 39 | for id in to_do: 40 | try: 41 | ids = [id] 42 | traceroute_data = fetch_traceroutes_from_measurement_ids_no_csv(ids) 43 | if len(traceroute_data) == 0: 44 | next_to_do.append(id) 45 | else: 46 | insert_lst = [] 47 | for t in traceroute_data: 48 | ts = t.split(",") 49 | insert_lst.append( 50 | ( 51 | ts[0], 52 | ts[1], 53 | ts[2], 54 | ts[3], 55 | int(ts[4]), 56 | int(ts[5]), 57 | float(ts[6]), 58 | int(ts[7]), 59 | int(ts[8]), 60 | int(ts[9]), 61 | int(ts[10]), 62 | ) 63 | ) 64 | # We insert traceroute data into the database to be used later 65 | clickhouse_driver = Clickhouse() 66 | query = 
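# The 11-tuples assembled above match the column order expected by the insert
# query built just below (insert_street_lvl_traceroutes_query) and by
# create_street_level_table() in scripts/utils/clickhouse.py:
#   src_addr, dst_prefix, dst_addr, resp_addr, proto, hop, rtt, ttl, prb_id, msm_id, tstamp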
clickhouse_driver.insert_street_lvl_traceroutes_query(STREET_LEVEL_TRACEROUTES_TABLE) 67 | clickhouse_driver.execute(query, insert_lst) 68 | except Exception: 69 | next_to_do.append(id) 70 | if len(next_to_do) > 0: 71 | # We wait to try again 72 | time.sleep(15) 73 | 74 | 75 | """ 76 | Function starts and fetches traceroute from all probes to all targets 77 | """ 78 | 79 | 80 | def multi_traceroutes(targets, probes): 81 | tmp_res_traceroutes = start_traceroutes_to_targets(targets, probes) 82 | traceroute_ids = [] 83 | for elem in tmp_res_traceroutes: 84 | traceroute_ids.append(elem[0]) 85 | 86 | get_traceroutes_results(traceroute_ids) 87 | return tmp_res_traceroutes 88 | 89 | 90 | def tier_1_performe_traceroutes(target_ip, vps=None): 91 | # Traceroute from every VP to the target 92 | if vps == None: 93 | probes = load_json(USER_ANCHORS_FILE) 94 | else: 95 | probes = vps 96 | multi_traceroutes([[target_ip]], probes) 97 | 98 | 99 | def get_circles_to_target(target_ip, vps=None): 100 | # Get Rtts from all VPs to the targets if traceroutes are already done 101 | clickhouse_driver = Clickhouse() 102 | query = clickhouse_driver.get_all_rtt_to_dst_address_query(STREET_LEVEL_TRACEROUTES_TABLE, target_ip) 103 | res = clickhouse_driver.execute(query) 104 | # If None we need to lunch traceroutes from every VP to the target 105 | if len(res) == 0: 106 | tier_1_performe_traceroutes(target_ip, vps) 107 | res = clickhouse_driver.execute(query) 108 | if len(res) == 0: 109 | return [] 110 | 111 | # Calculate per VP min RTT 112 | dict_rtt = {} 113 | for hop in res: 114 | if hop[0] not in dict_rtt: 115 | dict_rtt[hop[0]] = (hop[1], hop[2]) 116 | if hop[2] > dict_rtt[hop[0]][1]: 117 | dict_rtt[hop[0]] = (hop[1], hop[2]) 118 | if hop[2] == dict_rtt[hop[0]][1] and hop[1] < dict_rtt[hop[0]][0]: 119 | dict_rtt[hop[0]] = (hop[1], hop[2]) 120 | 121 | # From IPs get Geolocation given by RIPE Atlas 122 | if vps == None: 123 | probes_data = load_json(USER_ANCHORS_FILE) 124 | else: 125 | probes_data = vps 126 | dict_probe_info = {} 127 | for probe in probes_data: 128 | if probe["address_v4"] == target_ip: 129 | continue 130 | if "address_v4" not in probe or probe["address_v4"] not in dict_rtt: 131 | continue 132 | if ( 133 | "geometry" not in probe 134 | or "type" not in probe["geometry"] 135 | or probe["geometry"]["type"] != "Point" 136 | or "coordinates" not in probe["geometry"] 137 | ): 138 | continue 139 | lon, lat = probe["geometry"]["coordinates"] 140 | dict_probe_info[probe["address_v4"]] = ( 141 | lat, 142 | lon, 143 | dict_rtt[probe["address_v4"]][0], 144 | None, 145 | None, 146 | ) 147 | 148 | # Return a list of items 149 | # each Item is a VP (lat, lon, min_rtt, dist = None, dist_r = None) 150 | res = [] 151 | for k, v in dict_probe_info.items(): 152 | res.append(v) 153 | return res 154 | 155 | 156 | def get_rtt_diff(probe_ip, target_ip, landmark_ip): 157 | clickhouse_driver = Clickhouse() 158 | query = clickhouse_driver.get_all_rtt_from_probe_to_targets_query(STREET_LEVEL_TRACEROUTES_TABLE, probe_ip, target_ip, landmark_ip) 159 | res = clickhouse_driver.execute(query) 160 | rtt_dict_target = {} 161 | rtt_dict_landmark = {} 162 | 163 | for l in res: 164 | resp_ip = l[0] 165 | dst_ip = l[1] 166 | rtt = l[2] 167 | if dst_ip == target_ip: 168 | if resp_ip not in rtt_dict_target: 169 | rtt_dict_target[resp_ip] = rtt 170 | if rtt < rtt_dict_target[resp_ip]: 171 | rtt_dict_target[resp_ip] = rtt 172 | elif dst_ip == landmark_ip: 173 | if resp_ip not in rtt_dict_landmark: 174 | rtt_dict_landmark[resp_ip] = rtt 175 | 
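# This function returns the street-level d1 + d2 estimate: with R(x, y) the
# minimum observed RTT, it computes
#     R(vp, target) + R(vp, landmark) - 2 * R(vp, r1)
# where r1 is the last router shared by both paths (the common responding IP
# with the largest per-hop minimum RTT, taking the smaller of its two per-path
# minimums). Illustrative numbers: 30 ms + 28 ms - 2 * 24 ms = 10 ms.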
if rtt < rtt_dict_landmark[resp_ip]: 176 | rtt_dict_landmark[resp_ip] = rtt 177 | if target_ip not in rtt_dict_target or landmark_ip not in rtt_dict_landmark: 178 | return -1, None 179 | target_rtt = rtt_dict_target[target_ip] 180 | landmark_rtt = rtt_dict_landmark[landmark_ip] 181 | same_dict = {} 182 | for ip in rtt_dict_target: 183 | if ip in rtt_dict_landmark: 184 | same_dict[ip] = min(rtt_dict_landmark[ip], rtt_dict_target[ip]) 185 | best_rtt = 0 186 | best_ip = None 187 | for k, v in same_dict.items(): 188 | if v > best_rtt: 189 | best_rtt = v 190 | best_ip = k 191 | return target_rtt + landmark_rtt - best_rtt - best_rtt, best_ip 192 | 193 | 194 | def get_probes_to_use_for_circles(circles, vps=None): 195 | if vps == None: 196 | probes_data = load_json(USER_ANCHORS_FILE) 197 | else: 198 | probes_data = vps 199 | lats_lons = {} 200 | for circle in circles: 201 | lats_lons[(circle[0], circle[1])] = circle 202 | res = [] 203 | for probe in probes_data: 204 | if ( 205 | "geometry" not in probe 206 | or "type" not in probe["geometry"] 207 | or probe["geometry"]["type"] != "Point" 208 | or "coordinates" not in probe["geometry"] 209 | ): 210 | continue 211 | lon, lat = probe["geometry"]["coordinates"] 212 | if (lat, lon) in lats_lons: 213 | res.append(probe) 214 | return res 215 | 216 | 217 | def start_and_get_traceroutes(target_ip, used_vps, landmarks, all_vps): 218 | probes = get_probes_to_use_for_circles(used_vps, all_vps) 219 | tmp_res_traceroutes = multi_traceroutes(landmarks, probes) 220 | 221 | # For each traceroute to a landmark we try to get the last common router/IP (r1ip) and the distance d1 + d2 (rtt) 222 | res = [] 223 | for t in tmp_res_traceroutes: 224 | traceroute_id = t[0] 225 | probe_ip = t[1] 226 | landmark_ip = t[2] 227 | rtt, r1ip = get_rtt_diff(probe_ip, target_ip, landmark_ip) 228 | for landmark in landmarks: 229 | if landmark[0] == landmark_ip: 230 | res.append( 231 | ( 232 | probe_ip, 233 | target_ip, 234 | landmark_ip, 235 | r1ip, 236 | rtt, 237 | landmark[2], 238 | landmark[3], 239 | traceroute_id, 240 | ) 241 | ) 242 | break 243 | return res 244 | 245 | 246 | def serialize(res1): 247 | res = {} 248 | for k, v in res1.items(): 249 | res[k] = v 250 | if "vps" in res: 251 | tmp_lst = [] 252 | for x in res["vps"]: 253 | tmp_lst.append(list(x)) 254 | res["vps"] = tmp_lst 255 | if "tier2:landmarks" in res: 256 | tmp_lst = [] 257 | for x in res["tier2:landmarks"]: 258 | tmp_lst.append(list(x)) 259 | res["tier2:landmarks"] = tmp_lst 260 | if "tier2:traceroutes" in res: 261 | tmp_lst = [] 262 | for x in res["tier2:traceroutes"]: 263 | tmp_lst.append(list(x)) 264 | res["tier2:traceroutes"] = tmp_lst 265 | if "tier3:landmarks" in res: 266 | tmp_lst = [] 267 | for x in res["tier3:landmarks"]: 268 | tmp_lst.append(list(x)) 269 | res["tier3:landmarks"] = tmp_lst 270 | if "tier3:traceroutes" in res: 271 | tmp_lst = [] 272 | for x in res["tier3:traceroutes"]: 273 | tmp_lst.append(list(x)) 274 | res["tier3:traceroutes"] = tmp_lst 275 | return res 276 | -------------------------------------------------------------------------------- /scripts/utils/clickhouse.py: -------------------------------------------------------------------------------- 1 | """clickhouse client""" 2 | 3 | import subprocess 4 | 5 | from pathlib import Path 6 | from clickhouse_driver import Client 7 | 8 | from logger import logger 9 | from default import ( 10 | CLICKHOUSE_HOST, 11 | CLICKHOUSE_DB, 12 | CLICKHOUSE_USER, 13 | CLICKHOUSE_PASSWORD, 14 | CLICKHOUSE_CLIENT, 15 | ) 16 | 17 | 18 | class Clickhouse: 19 | 
def __init__( 20 | self, 21 | host: str = CLICKHOUSE_HOST, 22 | database: str = CLICKHOUSE_DB, 23 | user: str = CLICKHOUSE_USER, 24 | password: str = CLICKHOUSE_PASSWORD, 25 | client_path: Path = CLICKHOUSE_CLIENT, 26 | ) -> None: 27 | self.host = host 28 | self.database = database 29 | self.user = user 30 | self.password = password 31 | self.client_path = client_path 32 | 33 | self.client: Client = Client( 34 | host=self.host, user=self.user, password=self.password 35 | ) 36 | 37 | self.settings = {"max_block_size": 100000} 38 | 39 | def get_min_rtt_per_src_dst_query( 40 | self, table: str, filter: str, threshold=10000 41 | ) -> str: 42 | return f""" 43 | WITH arrayMin(groupArray(`min`)) as min_rtt 44 | SELECT IPv4NumToString(dst), IPv4NumToString(src), min_rtt 45 | FROM {self.database}.{table} 46 | WHERE `min` > -1 AND `min`< {threshold} AND dst != src {filter} 47 | GROUP BY (dst, src) 48 | """ 49 | 50 | def get_min_rtt_per_src_dst_prefix_query( 51 | self, table: str, filter: str, threshold=10000 52 | ) -> str: 53 | return f""" 54 | WITH arrayMin(groupArray(`min`)) as min_rtt 55 | SELECT IPv4NumToString(dst_prefix), IPv4NumToString(src), min_rtt 56 | FROM {self.database}.{table} 57 | WHERE `min` > -1 AND `min`< {threshold} 58 | AND dst_prefix != toIPv4(substring(cutIPv6(IPv4ToIPv6(src), 0, 1), 8)) 59 | {filter} 60 | GROUP BY dst_prefix, src 61 | """ 62 | 63 | def get_all_rtt_to_dst_address_query(self, table: str, target: str) -> str: 64 | return f""" 65 | SELECT src_addr, rtt, tstamp 66 | FROM {self.database}.{table} 67 | WHERE resp_addr = '{target}' AND dst_addr = '{target}' 68 | """ 69 | 70 | def get_all_rtt_from_probe_to_targets_query( 71 | self, table: str, src: str, target1: str, target2: str 72 | ) -> str: 73 | return f""" 74 | SELECT resp_addr, dst_addr, rtt 75 | FROM {self.database}.{table} 76 | WHERE src_addr = '{src}' and (dst_addr = '{target1}' or dst_addr = '{target2}') 77 | """ 78 | 79 | def insert_street_lvl_traceroutes_query(self, table: str) -> str: 80 | return f""" 81 | INSERT 82 | INTO {self.database}.{table} ( 83 | src_addr, dst_prefix, dst_addr, resp_addr, 84 | proto, hop, rtt, ttl, prb_id, msm_id, tstamp 85 | ) VALUES 86 | """ 87 | 88 | def insert_native_query(self, table: str, infile_path: Path) -> str: 89 | """insert data using local clickhouse file""" 90 | return f""" 91 | INSERT INTO {self.database}.{table} 92 | FROM INFILE '{str(infile_path)}' 93 | FORMAT Native""" 94 | 95 | def insert_csv_query(self, table: str, infile_path: Path) -> str: 96 | """insert data from csv file""" 97 | return f""" 98 | INSERT INTO {self.database}.{table} 99 | FROM INFILE '{str(infile_path)}' 100 | FORMAT CSV 101 | """ 102 | 103 | def insert_file(self, query: str) -> None: 104 | """execute clickhouse insert query as not supported by clickhouse-driver""" 105 | cmd = f"{str(self.client_path)} client" 106 | 107 | if self.password is not None and self.password != "": 108 | cmd += f"--password={self.password}" 109 | cmd += f' --query="{query}"' 110 | 111 | logger.info(f"executing query: {cmd}") 112 | 113 | ps = subprocess.run(cmd, shell=True, capture_output=True, text=True) 114 | 115 | if ps.stderr: 116 | raise RuntimeError( 117 | f"Could not insert data::{cmd}, failed with error: {ps.stderr}" 118 | ) 119 | else: 120 | logger.info(f"{cmd}::Successfully executed") 121 | 122 | def execute(self, query: str, arg_lst=[]) -> None: 123 | """execute query using clickhouse driver""" 124 | if arg_lst == []: 125 | return self.client.execute(query, settings=self.settings) 126 | else: 127 | return 
self.client.execute(query, arg_lst, settings=self.settings) 128 | 129 | def insert_from_values_query(self, table: str, values_description: str) -> str: 130 | """insert data from csv file""" 131 | return f""" 132 | INSERT INTO {self.database}.{table} 133 | ({values_description}) 134 | VALUES 135 | """ 136 | 137 | def insert_from_values(self, query: str, data: list) -> None: 138 | return self.client.execute(query, data, settings=self.settings) 139 | 140 | def execute_iter(self, query: str) -> None: 141 | """use clickhouse driver instead of subprocess""" 142 | return self.client.execute_iter(query, settings=self.settings) 143 | 144 | def create_prefixes_ping_tables(self, table_name: str) -> str: 145 | """create all ping tables""" 146 | return f""" 147 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 148 | ( 149 | `src` IPv4, 150 | `dst` IPv4, 151 | `dst_prefix` IPv4 MATERIALIZED toIPv4(substring(cutIPv6(IPv4ToIPv6(dst), 0, 1), 8)), 152 | `prb_id` UInt32, 153 | `date` DateTime, 154 | `sent` UInt32, 155 | `rcvd` UInt32, 156 | `rtts` Array(Float64), 157 | `min` Float64, 158 | `mean` Float64, 159 | `msm_id` UInt64, 160 | `proto` UInt8 161 | ) 162 | ENGINE=MergeTree() 163 | ORDER BY (dst_prefix, dst, src, msm_id, date) 164 | """ 165 | 166 | def create_target_ping_tables(self, table_name: str) -> str: 167 | """create table""" 168 | return f""" 169 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 170 | ( 171 | `src` IPv4, 172 | `dst` IPv4, 173 | `prb_id` UInt32, 174 | `date` DateTime, 175 | `sent` UInt32, 176 | `rcvd` UInt32, 177 | `rtts` Array(Float64), 178 | `min` Float64, 179 | `mean` Float64, 180 | `msm_id` UInt64, 181 | `proto` UInt8 182 | ) 183 | ENGINE=MergeTree() 184 | ORDER BY (dst, src, msm_id, date) 185 | """ 186 | 187 | def create_traceroutes_table(self, table_name: str) -> str: 188 | return f""" 189 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 190 | ( 191 | `src_ip` String, 192 | `dst_prefix` String, 193 | `dst_ip` String, 194 | `reply_ip` String, 195 | `proto` Int16, 196 | `hop` Int16, 197 | `rtt` Float64, 198 | `ttl` Int16, 199 | `prb_id` Int64, 200 | `msm_id` Int64, 201 | `timestamp` DateTime('UTC') 202 | ) 203 | ENGINE=MergeTree() 204 | ORDER BY (dst_prefix, dst_ip, src_ip, reply_ip) 205 | """ 206 | 207 | def create_street_level_table(self, table_name: str) -> str: 208 | """create the street level traceroute table""" 209 | 210 | return f""" 211 | CREATE TABLE IF NOT EXISTS {self.database}.{table_name} 212 | ( 213 | `src_addr` String, 214 | `dst_prefix` String, 215 | `dst_addr` String, 216 | `resp_addr` String, 217 | `proto` Int16, 218 | `hop` Int16, 219 | `rtt` Float64, 220 | `ttl` Int16, 221 | `prb_id` Int64, 222 | `msm_id` Int64, 223 | `tstamp` Datetime('UTC') 224 | ) 225 | ENGINE = MergeTree() 226 | ORDER BY (dst_addr, src_addr, tstamp) 227 | """ 228 | -------------------------------------------------------------------------------- /scripts/utils/clickhouse_installer.py: -------------------------------------------------------------------------------- 1 | """clickhouse client""" 2 | 3 | from scripts.utils.clickhouse import Clickhouse 4 | from logger import logger 5 | 6 | from default import * 7 | 8 | 9 | if __name__ == "__main__": 10 | clickhouse_driver = Clickhouse() 11 | 12 | ################################################################################################## 13 | # CREATE REPRO TABLES # 14 | ################################################################################################## 15 | 16 | # create anchors_meshed_table 17 | 
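Besides the table creation and bulk inserts driven by this installer, the query builders in the Clickhouse wrapper above can be used directly, for example to pull per-(src, dst) minimum RTTs. A hedged sketch (ANCHORS_MESHED_PING_TABLE comes from default.py; the local ClickHouse server set up by install.sh must be running):

```python
from default import ANCHORS_MESHED_PING_TABLE
from scripts.utils.clickhouse import Clickhouse

clickhouse = Clickhouse()
query = clickhouse.get_min_rtt_per_src_dst_query(ANCHORS_MESHED_PING_TABLE, filter="")
for dst, src, min_rtt in clickhouse.execute(query):
    print(dst, src, min_rtt)  # IPv4 strings and the minimum RTT in ms
```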
query = clickhouse_driver.create_target_ping_tables(ANCHORS_MESHED_PING_TABLE) 18 | clickhouse_driver.execute(query) 19 | logger.info(f"table {ANCHORS_MESHED_PING_TABLE} created") 20 | 21 | query = clickhouse_driver.create_target_ping_tables(PROBES_TO_ANCHORS_PING_TABLE) 22 | clickhouse_driver.execute(query) 23 | logger.info(f"table {PROBES_TO_ANCHORS_PING_TABLE} created") 24 | 25 | # create prefixes ping table 26 | query = clickhouse_driver.create_prefixes_ping_tables(ANCHORS_TO_PREFIX_TABLE) 27 | clickhouse_driver.execute(query) 28 | logger.info(f"table {ANCHORS_TO_PREFIX_TABLE} created") 29 | 30 | query = clickhouse_driver.create_prefixes_ping_tables(PROBES_TO_PREFIX_TABLE) 31 | clickhouse_driver.execute(query) 32 | logger.info(f"table {PROBES_TO_PREFIX_TABLE} created") 33 | 34 | query = clickhouse_driver.create_prefixes_ping_tables( 35 | TARGET_TO_LANDMARKS_PING_TABLE 36 | ) 37 | clickhouse_driver.execute(query) 38 | logger.info(f"table {TARGET_TO_LANDMARKS_PING_TABLE} created") 39 | 40 | # create traceroute table 41 | query = clickhouse_driver.create_traceroutes_table(ANCHORS_MESHED_TRACEROUTE_TABLE) 42 | clickhouse_driver.execute(query) 43 | logger.info(f"table {ANCHORS_MESHED_TRACEROUTE_TABLE} created") 44 | 45 | # Create street level db 46 | query = clickhouse_driver.create_street_level_table(STREET_LEVEL_TRACEROUTES_TABLE) 47 | clickhouse_driver.execute(query) 48 | logger.info(f"table {STREET_LEVEL_TRACEROUTES_TABLE} created") 49 | 50 | ################################################################################################## 51 | # INSERT REPRO DATA # 52 | ################################################################################################## 53 | 54 | # table names 55 | tables = [ 56 | ANCHORS_MESHED_TRACEROUTE_TABLE, 57 | PROBES_TO_ANCHORS_PING_TABLE, 58 | ANCHORS_TO_PREFIX_TABLE, 59 | PROBES_TO_PREFIX_TABLE, 60 | ANCHORS_MESHED_PING_TABLE, 61 | TARGET_TO_LANDMARKS_PING_TABLE, 62 | STREET_LEVEL_TRACEROUTES_TABLE, 63 | ] 64 | 65 | # measurements files_path 66 | file_paths = [ 67 | ANCHORS_MESHED_TRACEROUTE_FILE, 68 | PROBES_TO_ANCHORS_PING_FILE, 69 | ANCHORS_TO_PREFIX_FILE, 70 | PROBES_TO_PREFIX_FILE, 71 | ANCHORS_MESHED_PING_FILE, 72 | TARGET_TO_LANDMARKS_PING_FILE, 73 | STREET_LEVEL_TRACEROUTES_FILE, 74 | ] 75 | 76 | for table_name, file_path in zip(tables, file_paths): 77 | logger.info(f"inserting data into {table_name} from {file_path}") 78 | insert_query = clickhouse_driver.insert_native_query(table_name, file_path) 79 | 80 | clickhouse_driver.insert_file(insert_query) 81 | 82 | ################################################################################################## 83 | # CREATE USER MEASUREMENT TABLES # 84 | ################################################################################################## 85 | 86 | query = clickhouse_driver.create_target_ping_tables(USER_VPS_TO_TARGET_TABLE) 87 | clickhouse_driver.execute(query) 88 | logger.info(f"table {USER_VPS_TO_TARGET_TABLE} created") 89 | 90 | query = clickhouse_driver.create_target_ping_tables(USER_MESHED_TABLE) 91 | clickhouse_driver.execute(query) 92 | logger.info(f"table {USER_MESHED_TABLE} created") 93 | 94 | # create prefixes ping table 95 | query = clickhouse_driver.create_prefixes_ping_tables(USER_VPS_TO_PREFIX_TABLE) 96 | clickhouse_driver.execute(query) 97 | logger.info(f"table {USER_VPS_TO_PREFIX_TABLE} created") 98 | 99 | query = clickhouse_driver.create_prefixes_ping_tables( 100 | USER_TARGET_TO_LANDMARKS_PING_TABLE 101 | ) 102 | clickhouse_driver.execute(query) 103 | 
logger.info(f"table {USER_TARGET_TO_LANDMARKS_PING_TABLE} created") 104 | 105 | # create traceroute table 106 | query = clickhouse_driver.create_traceroutes_table( 107 | USER_ANCHORS_MESHED_TRACEROUTE_TABLE 108 | ) 109 | clickhouse_driver.execute(query) 110 | logger.info(f"table {USER_ANCHORS_MESHED_TRACEROUTE_TABLE} created") 111 | 112 | # Create street level db 113 | query = clickhouse_driver.create_street_level_table( 114 | USER_STREET_LEVEL_TRACEROUTES_TABLE 115 | ) 116 | clickhouse_driver.execute(query) 117 | logger.info(f"table {USER_STREET_LEVEL_TRACEROUTES_TABLE} created") 118 | -------------------------------------------------------------------------------- /scripts/utils/credentials.py: -------------------------------------------------------------------------------- 1 | """get all credentials (Clickhouse and RIPE)""" 2 | 3 | import json 4 | import os 5 | 6 | from logger import logger 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | 12 | def get_clickhouse_credentials() -> dict: 13 | """return clickhouse credentials""" 14 | 15 | # try to get credentials with env var directly 16 | try: 17 | return { 18 | "base_url": os.environ["CLICKHOUSE_BASE_URL"], 19 | "user": os.environ["CLICKHOUSE_USER"], 20 | "password": os.environ["CLICKHOUSE_PASSWORD"], 21 | } 22 | 23 | except KeyError as e: 24 | logger.error( 25 | f"Missing credentials for interacting with IRIS API (set: CLICKHOUSE_BASE_URL | CLICKHOUSE_USERNAME | CLICKHOUSE_PASSWORD): {e}" 26 | ) 27 | 28 | 29 | def get_ripe_atlas_credentials() -> dict: 30 | """return ripe credentials""" 31 | try: 32 | return { 33 | "username": os.environ["RIPE_USERNAME"], 34 | "secret_key": os.environ["RIPE_SECRET_KEY"], 35 | } 36 | 37 | except KeyError as e: 38 | logger.error( 39 | f"Missing credentials for interacting with IRIS API (set: CLICKHOUSE_BASE_URL | CLICKHOUSE_USERNAME | CLICKHOUSE_PASSWORD): {e}" 40 | ) 41 | -------------------------------------------------------------------------------- /scripts/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | """Functions to load and save data into a json format. 2 | All the paths are given in default.py file. 
3 | """ 4 | import ujson as json 5 | 6 | from pathlib import Path 7 | 8 | 9 | def load_json(file_path: Path): 10 | # check that dirs exits 11 | if not file_path.parent.exists(): 12 | file_path.parent.mkdir(parents=True, exist_ok=True) 13 | 14 | with open(file_path) as f: 15 | return json.load(f) 16 | 17 | 18 | def dump_json(data, file_path: Path): 19 | """dump data to output file""" 20 | # check that dirs exits 21 | if not file_path.parent.exists(): 22 | file_path.parent.mkdir(parents=True, exist_ok=True) 23 | 24 | with open(file_path, "w") as f: 25 | json.dump(data, f, indent=4) 26 | 27 | 28 | # def append_results(data, file_path: Paths) 29 | -------------------------------------------------------------------------------- /scripts/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # Mathematical functions helpful for geolocation problems 2 | 3 | import itertools 4 | import numpy as np 5 | 6 | from math import asin, cos, log, radians, sin, sqrt, pi 7 | 8 | 9 | def internet_speed(rtt, speed_threshold): 10 | if speed_threshold is not None: 11 | return speed_threshold 12 | 13 | if rtt >= 80: 14 | speed_threshold = 4 / 9 15 | if rtt >= 5 and rtt < 80: 16 | speed_threshold = 3 / 9 17 | if rtt >= 0 and rtt < 5: 18 | speed_threshold = 1 / 6 19 | 20 | return speed_threshold 21 | 22 | 23 | def rtt_to_km(rtt, speed_threshold=None, c=300): 24 | return internet_speed(rtt, speed_threshold) * rtt * c / 2 25 | 26 | 27 | def is_within_cirle(vp_geo, rtt, candidate_geo, speed_threshold=None): 28 | d = rtt_to_km(rtt, speed_threshold) 29 | d_vp_candidate = haversine(vp_geo, candidate_geo) 30 | if d < d_vp_candidate: 31 | return False 32 | else: 33 | return True 34 | 35 | 36 | def geo_to_cartesian(lat, lon): 37 | lat *= np.pi / 180 38 | lon *= np.pi / 180 39 | 40 | x = np.cos(lon) * np.cos(lat) 41 | y = np.sin(lon) * np.cos(lat) 42 | z = np.sin(lat) 43 | 44 | return x, y, z 45 | 46 | 47 | def check_circle_inclusion(c_1, c_2): 48 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1 49 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2 50 | d = haversine((lat_1, lon_1), (lat_2, lon_2)) 51 | if d_1 > (d + d_2): 52 | return c_1, c_2 53 | elif d_2 > (d + d_1): 54 | return c_2, c_1 55 | return None, None 56 | 57 | 58 | def circle_preprocessing(circles, speed_threshold=None): 59 | circles_to_ignore = set() 60 | 61 | circles_with_r_info = [] 62 | for c in circles: 63 | lat, lon, rtt, d, r = c 64 | if d is None: 65 | d = rtt_to_km(rtt, speed_threshold) 66 | if r is None: 67 | r = d / 6371 68 | circles_with_r_info.append((lat, lon, rtt, d, r)) 69 | 70 | for i in range(len(circles_with_r_info)): 71 | c_1 = circles_with_r_info[i] 72 | if c_1 in circles_to_ignore: 73 | continue 74 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1 75 | for j in range(i + 1, len(circles_with_r_info)): 76 | c_2 = circles_with_r_info[j] 77 | if c_2 in circles_to_ignore: 78 | continue 79 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2 80 | remove, keep = check_circle_inclusion( 81 | (lat_1, lon_1, rtt_1, d_1, r_1), (lat_2, lon_2, rtt_2, d_2, r_2) 82 | ) 83 | if remove: 84 | circles_to_ignore.add(remove) 85 | 86 | circles_to_keep = set(circles_with_r_info) - circles_to_ignore 87 | 88 | return circles_to_keep 89 | 90 | 91 | def get_points_on_circle(lat_c, lon_c, r_c, nb_points: int = 4): 92 | """from a circle, return a set of points""" 93 | circle_points = [] 94 | for k in range(nb_points): 95 | # compute 96 | angle = pi * 2 * k / nb_points 97 | dx = r_c * 1000 * cos(angle) 98 | dy = r_c * 1000 * sin(angle) 99 | lat = lat_c + (180 / pi) * (dy / 
6378137) 100 | lon = lon_c + (180 / pi) * (dx / 6378137) / cos(lat_c * pi / 180) 101 | 102 | circle_points.append((lat, lon)) 103 | 104 | return circle_points 105 | 106 | 107 | def circle_intersections(circles, speed_threshold=None): 108 | """ 109 | Check out this link for more details about the maths: 110 | https://gis.stackexchange.com/questions/48937/calculating-intersection-of-two-circles 111 | """ 112 | intersect_points = [] 113 | 114 | circles = circle_preprocessing(circles, speed_threshold=speed_threshold) 115 | 116 | if len(circles) == 1: 117 | single_circle = list(circles)[0] 118 | lat, lon, rtt, d, r = single_circle 119 | filtered_points = get_points_on_circle(lat, lon, d) 120 | return filtered_points, circles 121 | 122 | for c_1, c_2 in itertools.combinations(circles, 2): 123 | lat_1, lon_1, rtt_1, d_1, r_1 = c_1 124 | lat_2, lon_2, rtt_2, d_2, r_2 = c_2 125 | 126 | x1 = np.array(list(geo_to_cartesian(lat_1, lon_1))) 127 | x2 = np.array(list(geo_to_cartesian(lat_2, lon_2))) 128 | 129 | q = np.dot(x1, x2) 130 | 131 | a = (np.cos(r_1) - np.cos(r_2) * q) / (1 - (q**2)) 132 | b = (np.cos(r_2) - np.cos(r_1) * q) / (1 - (q**2)) 133 | 134 | x0 = a * x1 + b * x2 135 | 136 | n = np.cross(x1, x2) 137 | if (1 - np.dot(x0, x0)) / np.dot(n, n) <= 0: 138 | # print("ANYCAST???", (lat_1, lon_1, rtt_1, d_1), (lat_2, lon_2, rtt_2, d_2)) 139 | continue 140 | 141 | t = np.sqrt((1 - np.dot(x0, x0)) / np.dot(n, n)) 142 | 143 | i1 = x0 + t * n 144 | i2 = x0 - t * n 145 | 146 | i_lon_1 = np.arctan2(i1[1], i1[0]) * (180 / np.pi) 147 | i_lat_1 = np.arctan(i1[2] / np.sqrt((i1[0] ** 2) + (i1[1] ** 2))) / ( 148 | np.pi / 180 149 | ) 150 | intersect_points.append((i_lat_1, i_lon_1)) 151 | 152 | i_lon_2 = np.arctan2(i2[1], i2[0]) * (180 / np.pi) 153 | i_lat_2 = np.arctan(i2[2] / np.sqrt((i2[0] ** 2) + (i2[1] ** 2))) / ( 154 | np.pi / 180 155 | ) 156 | intersect_points.append((i_lat_2, i_lon_2)) 157 | 158 | filtred_points = [] 159 | for point_geo in intersect_points: 160 | for lat_c, long_c, rtt_c, d_c, r_c in circles: 161 | if not is_within_cirle((lat_c, long_c), rtt_c, point_geo, speed_threshold): 162 | break 163 | else: 164 | filtred_points.append(point_geo) 165 | 166 | return filtred_points, circles 167 | 168 | 169 | def polygon_centroid(points): 170 | """ 171 | Compute polygon centroid using Finit Set of point method. 
172 | (see https://en.wikipedia.org/wiki/Centroid#Of_a_finite_set_of_points) 173 | """ 174 | x = 0 175 | y = 0 176 | for point in points: 177 | x += point[0] 178 | y += point[1] 179 | return x / len(points), y / len(points) 180 | 181 | 182 | def haversine(input_location, block_location): 183 | """Distance between two locations in earth.""" 184 | in_lat, in_lon, block_lat, block_lon = map( 185 | np.radians, [*input_location, *block_location] 186 | ) 187 | 188 | dlat = block_lat - in_lat 189 | dlon = block_lon - in_lon 190 | 191 | distances = ( 192 | np.sin(dlat / 2.0) ** 2 193 | + np.cos(in_lat) * np.cos(block_lat) * np.sin(dlon / 2.0) ** 2 194 | ) 195 | 196 | return 6367 * 2 * np.arcsin(np.sqrt(distances)) 197 | 198 | 199 | def distance(lat1, lat2, lon1, lon2): 200 | lon1 = radians(lon1) 201 | lon2 = radians(lon2) 202 | lat1 = radians(lat1) 203 | lat2 = radians(lat2) 204 | 205 | # Haversine formula 206 | dlon = lon2 - lon1 207 | dlat = lat2 - lat1 208 | a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2 209 | 210 | c = 2 * asin(sqrt(a)) 211 | 212 | r = 6371 213 | 214 | return c * r 215 | 216 | 217 | def get_middle_intersection(intersections): 218 | """in case of only two intersection points, return the middle segment""" 219 | (lat1, lon1) = intersections[0] 220 | (lat2, lon2) = intersections[1] 221 | 222 | # convert to radians 223 | lon1 = radians(lon1) 224 | lon2 = radians(lon2) 225 | lat1 = radians(lat1) 226 | lat2 = radians(lat2) 227 | 228 | # calculate the middle of two points 229 | Bx = np.cos(lat2) * np.cos(lon2 - lon1) 230 | By = np.cos(lat2) * np.sin(lon2 - lon1) 231 | latMid = np.arctan2( 232 | np.sin(lat1) + np.sin(lat2), 233 | np.sqrt((np.cos(lat1) + Bx) * (np.cos(lat1) + Bx) + By * By), 234 | ) 235 | lonMid = lon1 + np.arctan2(By, np.cos(lat1) + Bx) 236 | 237 | # convert back to degrees 238 | latMid = latMid * (180 / pi) 239 | lonMid = lonMid * (180 / pi) 240 | 241 | return latMid, lonMid 242 | 243 | 244 | def select_best_guess_centroid(target_ip, vp_coordinates_per_ip, rtt_per_vp_to_target): 245 | """ 246 | Find the best guess 247 | that is the location of the vantage point closest to the centroid. 
248 | """ 249 | probe_circles = {} 250 | closest_vp = None 251 | min_rtt_per_vp_ip = {} 252 | for vp_ip, rtts in rtt_per_vp_to_target.items(): 253 | if target_ip == vp_ip: 254 | continue 255 | if vp_ip not in vp_coordinates_per_ip: 256 | continue 257 | lat, lon = vp_coordinates_per_ip[vp_ip] 258 | min_rtt = min(rtts) 259 | if min_rtt > 100: 260 | continue 261 | min_rtt_per_vp_ip[vp_ip] = min_rtt 262 | # too inflated RTT means that measurement will not provide useful info 263 | 264 | if isinstance(min_rtt, float): 265 | probe_circles[vp_ip] = ( 266 | lat, 267 | lon, 268 | min_rtt, 269 | None, 270 | None, 271 | ) 272 | # print(f"vp_anchor = {vp_ip} with results: {min_rtt}") 273 | # print() 274 | 275 | # draw circles 276 | if not probe_circles: 277 | return None 278 | circles = list(probe_circles.values()) 279 | intersections, circles = circle_intersections(circles, speed_threshold=2/3) 280 | if len(intersections) > 2: 281 | centroid = polygon_centroid(intersections) 282 | elif len(intersections) == 2: 283 | # only two circles intersection, centroid is middle of the segment 284 | centroid = get_middle_intersection(intersections) 285 | else: 286 | # only one circle so take the closest vp as the centroid 287 | closest_vp, _ = min(min_rtt_per_vp_ip.items(), key=lambda x: x[1]) 288 | centroid = vp_coordinates_per_ip[closest_vp] 289 | 290 | return centroid, circles 291 | 292 | 293 | def get_center_of_poly(circles, speed): 294 | points, circles = circle_intersections(circles, speed) 295 | if len(points) == 0: 296 | return None, None 297 | return polygon_centroid(points) 298 | 299 | 300 | def get_points_in_poly(circles, rot, rad, speed, old_circles=[]): 301 | circles = circle_preprocessing(circles, speed_threshold=speed) 302 | points, circles = circle_intersections(circles, speed) 303 | if len(points) == 0: 304 | return [] 305 | else: 306 | center = polygon_centroid(points) 307 | res = [center] 308 | iter_rad = 0 309 | points_added = True 310 | while points_added: 311 | iter_rad += rad 312 | points_added = False 313 | to_add_points = get_points_on_circle( 314 | center[0], center[1], iter_rad, int(360/rot)) 315 | for point in to_add_points: 316 | all_in = True 317 | for vp in circles: 318 | if not is_within_cirle((vp[0], vp[1]), vp[2], point, speed): 319 | all_in = False 320 | break 321 | if all_in: 322 | for vp in old_circles: 323 | if not is_within_cirle((vp[0], vp[1]), vp[2], point, speed): 324 | all_in = False 325 | break 326 | if all_in: 327 | points_added = True 328 | res.append(point) 329 | return res 330 | 331 | def greedy_selection_probes_impl(probe, distance_per_probe, selected_probes): 332 | 333 | distances_log = [log(distance_per_probe[p]) for p in selected_probes 334 | if p in distance_per_probe and distance_per_probe[p] > 0] 335 | total_distance = sum(distances_log) 336 | return probe, total_distance 337 | 338 | -------------------------------------------------------------------------------- /scripts/utils/measurement_utils.py: -------------------------------------------------------------------------------- 1 | """functions for running measurements""" 2 | 3 | import random 4 | import time 5 | 6 | from datetime import datetime 7 | from uuid import UUID 8 | from pathlib import Path 9 | from dateutil import parser 10 | 11 | from logger import logger 12 | from scripts.utils.file_utils import load_json, dump_json 13 | from scripts.ripe_atlas.atlas_api import get_prefix_from_ip, get_measurements_from_tag 14 | from scripts.ripe_atlas.ping_and_traceroute_classes import PING 15 | from 
scripts.utils.clickhouse import Clickhouse 16 | 17 | from default import ( 18 | PREFIX_MEASUREMENT_RESULTS, 19 | TARGET_MEASUREMENT_RESULTS, 20 | USER_VPS_TO_PREFIX_TABLE, 21 | USER_VPS_TO_TARGET_TABLE, 22 | ) 23 | 24 | 25 | def load_targets(target_file_path: Path, nb_target: int = -1) -> list: 26 | """get a file as entry, return a list of ip target""" 27 | targets = load_json(target_file_path) 28 | 29 | if nb_target > len(targets) or nb_target < 0: 30 | nb_target = len(targets) 31 | 32 | subset_targets = random.sample(targets, k=nb_target) 33 | 34 | return subset_targets 35 | 36 | 37 | def load_vps(vps_file_path: Path, nb_vps: int = -1) -> list: 38 | """load vps from file, return list of vps""" 39 | vps = load_json(vps_file_path) 40 | 41 | if nb_vps > len(vps) or nb_vps < 0: 42 | nb_vps = len(vps) 43 | 44 | subset_vps = random.sample(vps, k=nb_vps) 45 | 46 | return subset_vps 47 | 48 | 49 | def get_measurement_config( 50 | experiment_uuid: UUID, 51 | prefix_measurement_uuid: UUID, 52 | target_measurement_uuid: UUID, 53 | targets: list, 54 | target_prefixes: list, 55 | vps: dict, 56 | dry_run=False, 57 | ) -> dict: 58 | """return measurement config for future retrieval""" 59 | return { 60 | "experiment_uuid": str(experiment_uuid), 61 | "status": "ongoing", 62 | "start_time": str(datetime.now()), 63 | "end_time": None, 64 | "is_dry_run": dry_run, 65 | "nb_targets": len(targets), 66 | "nb_vps": len(vps), 67 | "description": "measurements from a set of vps towards all targets/target prefixes", 68 | "af": 4, 69 | "target_measurements": { 70 | "measurement_uuid": str(target_measurement_uuid), 71 | "targets": targets, 72 | "vps": vps, 73 | "end_time": None, 74 | }, 75 | "prefix_measurements": { 76 | "measurement_uuid": str(prefix_measurement_uuid), 77 | "targets": target_prefixes, 78 | "vps": vps, 79 | "end_time": None, 80 | }, 81 | } 82 | 83 | 84 | def save_measurement_config(measurement_config: dict, out_path: Path) -> None: 85 | """save measurement config""" 86 | 87 | try: 88 | if ( 89 | measurement_config["prefix_measurements"]["end_time"] is not None 90 | and measurement_config["target_measurements"]["end_time"] is not None 91 | ): 92 | measurement_config["end_time"] = str(datetime.now()) 93 | measurement_config["status"] = "finished" 94 | except KeyError: 95 | pass 96 | 97 | dump_json(measurement_config, out_path) 98 | 99 | 100 | def get_target_prefixes(targets: list) -> list: 101 | """from a set of targets ip addresses return their /24 prefixes""" 102 | return [get_prefix_from_ip(target_addr) for target_addr in targets] 103 | 104 | 105 | def ping_prefixes( 106 | measurement_uuid: UUID, 107 | measurement_config: dict, 108 | target_prefixes: list, 109 | targets_per_prefix: dict[list], 110 | vps: list[dict], 111 | dry_run: bool = False, 112 | use_cache: bool = True, 113 | cache_file: Path = PREFIX_MEASUREMENT_RESULTS, 114 | ) -> None: 115 | """ping all targets prefixes from all vps""" 116 | 117 | pinger = PING() 118 | 119 | try: 120 | # load cached prefix results in case measurement was interrupted 121 | if use_cache: 122 | cached_results = load_json(cache_file) 123 | 124 | if cached_results: 125 | logger.info( 126 | f"initial length targets: {len(targets_per_prefix)}, cached measurements : {len(cached_results)}" 127 | ) 128 | 129 | # get prefixes out of targets 130 | cached_results = [ 131 | get_prefix_from_ip(target["dst_addr"]) for target in cached_results 132 | ] 133 | for subnet in cached_results: 134 | if subnet not in targets_per_prefix: 135 | continue 136 | 
targets_per_prefix.pop(subnet) 137 | 138 | logger.info( 139 | f"after removing cached: {len(targets_per_prefix)}, cached measurements : {len(cached_results)}" 140 | ) 141 | except FileNotFoundError: 142 | logger.info("No cached results available") 143 | pass 144 | 145 | logger.info( 146 | f"Starting measurements {str(measurement_uuid)} with parameters: dry_run={dry_run}; nb_targets={len(target_prefixes)}; nb_vps={len(vps)}." 147 | ) 148 | 149 | # measurement for 3 targets in every target prefixes 150 | ids, start_time, end_time = pinger.ping_by_prefix( 151 | target_prefixes=target_prefixes, 152 | vps=vps, 153 | targets_per_prefix=targets_per_prefix, 154 | tag=measurement_uuid, 155 | dry_run=dry_run, 156 | ) 157 | 158 | # overwrite ids 159 | if "ids" in measurement_config["prefix_measurements"]: 160 | ids.extend(measurement_config["prefix_measurements"]["ids"]) 161 | 162 | measurement_config["prefix_measurements"]["start_time"] = start_time 163 | measurement_config["prefix_measurements"]["end_time"] = end_time 164 | 165 | 166 | def ping_targets( 167 | measurement_uuid: UUID, 168 | measurement_config: dict, 169 | targets: list[dict], 170 | vps: list[dict], 171 | dry_run: bool = False, 172 | use_cache: bool = True, 173 | cache_file: Path = TARGET_MEASUREMENT_RESULTS, 174 | ) -> None: 175 | """ping all targets using all vps""" 176 | 177 | pinger = PING() 178 | 179 | targets = [t["address_v4"] for t in targets] 180 | 181 | try: 182 | if use_cache: 183 | cached_results = load_json(cache_file) 184 | logger.info( 185 | f"initial length targets: {len(targets)}, cached measurements : {len(cached_results)}" 186 | ) 187 | 188 | cached_results = [c["dst_addr"] for c in cached_results] 189 | 190 | targets = list(set(targets).difference(set(cached_results))) 191 | 192 | logger.info( 193 | f"after removing cached: {len(targets)}, cached measurements : {len(cached_results)}" 194 | ) 195 | except FileNotFoundError: 196 | logger.info("No cached results available") 197 | pass 198 | 199 | logger.info( 200 | f"Starting measurements {str(measurement_uuid)} with parameters: dry_run={dry_run}; nb_targets={len(targets)}; nb_vps={len(vps)}." 

    ids, start_time, end_time = pinger.ping_by_target(
        targets=targets, vps=vps, tag=measurement_uuid, dry_run=dry_run
    )

    # keep measurement ids from a previous, interrupted run
    if "ids" in measurement_config["target_measurements"]:
        ids.extend(measurement_config["target_measurements"]["ids"])

    measurement_config["target_measurements"]["start_time"] = start_time
    measurement_config["target_measurements"]["end_time"] = end_time


def get_latest_measurements(config_path: Path) -> dict:
    """retrieve the latest measurement config"""
    try:
        assert config_path.is_dir()
    except AssertionError:
        logger.error(f"config path is not a dir: {config_path}")

    latest: datetime = None
    latest_config = None
    for file in config_path.iterdir():
        measurement_config = load_json(file)
        start_time = parser.isoparse(measurement_config["start_time"])
        # keep the config with the most recent start time
        if latest is None or latest < start_time:
            latest = start_time
            latest_config = measurement_config

    return latest_config


def retrieve_results(
    measurement_uuid: str,
    out_file: Path,
) -> list:
    """query the RIPE Atlas API to retrieve all measurement results"""
    # fetch results from the API
    measurement_results = get_measurements_from_tag(measurement_uuid)

    logger.info(
        f"nb measurements retrieved: {len(measurement_results)} for measurement_uuid: {measurement_uuid}"
    )

    # save results in a cache file
    dump_json(measurement_results, out_file)

    return measurement_results


def insert_prefix_results(results: list) -> None:
    """insert prefix measurement results with the VALUES insert method"""
    rows = []
    values_description = (
        "src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto"
    )

    if not results:
        raise RuntimeError(f"no data to insert, data = {results}")

    for result in results:
        try:
            # parse response
            src = result["src_addr"]
            dst = result["dst_addr"]
            prb_id = result["prb_id"]
            date = result["timestamp"]
            sent = result["sent"]
            rcvd = result["rcvd"]
            # keep only replies that carry an rtt (timeouts do not)
            rtts = [r["rtt"] for r in result["result"] if "rtt" in r] or [-1]
            min = result["min"]
            mean = result["avg"]
            msm_id = result["msm_id"]
            proto = 0

            row = [src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto]

            rows.append(row)
        except KeyError as e:
            logger.warning(f"Some measurements do not contain results: {e}")

    clickhouse = Clickhouse()
    query = clickhouse.insert_from_values_query(
        USER_VPS_TO_PREFIX_TABLE, values_description
    )
    clickhouse.insert_from_values(query, rows)

    logger.info(
        f"Prefix measurements successfully inserted in table: {USER_VPS_TO_PREFIX_TABLE}"
    )


def insert_target_results(results: list) -> None:
    """insert target measurement results with the VALUES insert method"""
    rows = []
    values_description = (
        "src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto"
    )
    for result in results:
        # parse response
        src = result["src_addr"]
        dst = result["dst_addr"]
        prb_id = result["prb_id"]
        date = result["timestamp"]
        sent = result["sent"]
        rcvd = result["rcvd"]
        # keep only replies that carry an rtt (timeouts do not)
        rtts = [r["rtt"] for r in result["result"] if "rtt" in r] or [-1]
        min = result["min"]
        mean = result["avg"]
        msm_id = result["msm_id"]
        proto = 0

        row = [src, dst, prb_id, date, sent, rcvd, rtts, min, mean, msm_id, proto]

        rows.append(row)

    clickhouse = Clickhouse()
    query = clickhouse.insert_from_values_query(
        USER_VPS_TO_TARGET_TABLE, values_description
    )
    clickhouse.insert_from_values(query, rows)

    logger.info(
        f"Target measurements successfully inserted in table: {USER_VPS_TO_TARGET_TABLE}"
    )
--------------------------------------------------------------------------------
/scripts/utils/plot_utils.py:
--------------------------------------------------------------------------------
"""Functions to plot figures in a nice way"""

from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib

matplotlib.use("Agg")

font = {"weight": "bold", "size": 16}  # 'family' : 'normal',
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42
fontsize_axis = 17
font_size_alone = 14
matplotlib.rc("font", **font)

markers = ["o", "s", "v", "^"]
linestyles = ["-", "--", "-.", ":"]

colors_blind = [
    ["blue", (0, 114.0 / 255, 178.0 / 255)],
    ["orange", (230.0 / 255, 159.0 / 255, 0)],
    ["reddish_purple", (204.0 / 255, 121.0 / 255, 167.0 / 255)],
    ["black", (0, 0, 0)],
    ["bluish_green", (0, 158.0 / 255, 115.0 / 255)],
    ["sky_blue", (86.0 / 255, 180.0 / 255, 233.0 / 255)],
    ["vermillon", (213.0 / 255, 94.0 / 255, 0)],
    # ["yellow", (240.0 / 255, 228.0 / 255, 66.0 / 255)],
]


def plot_multiple_cdf(
    Ys,
    n_bins,
    xmin,
    xmax,
    xlabel,
    ylabel,
    legend,
    ymin=0,
    ymax=1.05,
    xticks=None,
    xticks_labels=None,
    xscale="linear",
    yscale="linear",
    cumulative=True,
    figure=None,
    axes=None,
    offset=0,
    colors_arg=None,
    linestyles_arg=None,
):
    if figure is not None and axes is not None:
        fig = figure
        ax = axes
    else:
        subplots = plt.subplots()
        fig, ax = subplots
    ax.set_xlabel(xlabel, fontsize=fontsize_axis)
    ax.set_ylabel(ylabel, fontsize=fontsize_axis)
    # title = title + " CDF"
    # plt.title("CDF", fontsize=fontsize_axis)

    ax.grid(linestyle="dotted")
    if len(Ys) == 1:
        i = 0
        Y = Ys[i]
        if colors_arg is not None:
            color = colors_arg[i][1]
        else:
            color = colors_blind[(i + offset) % len(colors_blind)][1]

        if linestyles_arg is not None:
            linestyle = linestyles_arg[i]
        else:
            linestyle = linestyles[(i + offset) % len(linestyles)]

        n, bins, patches = ax.hist(
            Y,
            density=True,
            histtype="step",
            bins=n_bins,
            cumulative=cumulative,
            linewidth=1.35,
            color=color,
            linestyle=linestyle,
        )
        patches[0].set_xy(patches[0].get_xy()[1:-1])
    else:
        for i in range(0, len(Ys)):
            Y = Ys[i]
            if colors_arg is not None:
                color = colors_arg[i][1]
            else:
                color = colors_blind[(i + offset) % len(colors_blind)][1]

            if linestyles_arg is not None:
                linestyle = linestyles_arg[i]
            else:
                linestyle = linestyles[(i + offset) % len(linestyles)]

            n, bins, patches = ax.hist(
                Y,
                density=True,
                histtype="step",
                bins=n_bins,
                cumulative=cumulative,
                linewidth=1.35,
                label=legend[i],
                color=color,
                linestyle=linestyle,
            )
            patches[0].set_xy(patches[0].get_xy()[1:-1])

    # plt.xscale("symlog")
    # xticks = ax.xaxis.get_major_ticks()
    # xticks[1].label1.set_visible(False)
    # # xticks[2].label1.set_visible(False)
    # xticks[-2].label1.set_visible(False)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_xlim(left=xmin, right=xmax)
    ax.set_ylim(bottom=ymin, top=ymax)
    if xticks is not None:
        ax.set_xticks(xticks)
        # xtickNames = plt.setp(ax, xticklabels=[f"{r}" for r in x_ticks])
    if xticks_labels is not None:
        ax.set_xticklabels(xticks_labels)

    # Normalize the data to a proper PDF
    # plt.tight_layout()
    # plt.savefig(r"resources/figures/" + ofile + ".pdf")
    return fig, ax


def plot_multiple_error_bars(
    X, Ys, Yerrs, xmin, xmax, ymin, ymax, xlabel, ylabel, xscale, yscale, labels
):
    fig, ax = plt.subplots()
    ax.set_xlabel(xlabel, fontsize=fontsize_axis)

    ax.set_ylabel(ylabel, fontsize=fontsize_axis)
    ax.grid(linestyle="dotted")

    # x_ticks = [inf_born+1]
    for i in range(len(Ys)):
        Y = Ys[i]
        Yerr = Yerrs[i]
        lns1 = ax.errorbar(
            X,
            Y,
            Yerr,
            label=labels[i],
            linewidth=0.5,
            marker=markers[i % len(markers)],
            markersize=1,
            markeredgewidth=1,
            capsize=2,
        )
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_xlim(left=xmin, right=xmax)
    ax.set_ylim(bottom=ymin, top=ymax)
    return fig, ax


def plot_save(ofile: Path, is_tight_layout):
    # make sure the output directory exists
    if not ofile.parent.exists():
        ofile.parent.mkdir(parents=True, exist_ok=True)

    if is_tight_layout:
        plt.tight_layout()
    # plt.show()
    plt.savefig(ofile)

    # plt.clf()


def homogenize_legend(ax, legend_location, legend_size=14):
    handles, labels = ax.get_legend_handles_labels()
    new_handles = []
    for h in handles:
        if isinstance(h, Line2D):
            new_handles.append(h)
        elif isinstance(h, Polygon):
            new_handles.append(
                Line2D([], [], linestyle=h.get_linestyle(), color=h.get_edgecolor())
            )
    ax.legend(
        loc=legend_location,
        prop={"size": legend_size},
        handles=new_handles,
        labels=labels,
    )


def plot_scatter_multiple(
    Xs,
    Ys,
    xmin,
    xmax,
    ymin,
    ymax,
    xscale,
    yscale,
    xlabel,
    ylabel,
    markers,
    marker_colors,
    marker_size,
):
    fig, ax = plt.subplots()

    # ax.set_xlabel(title, fontsize=fontsize_axis)
    # plt.title("CDF", fontsize=fontsize_axis)

    # x_ticks = [inf_born]
    # x_ticks.extend(np.arange(inf_born, sup_born, xtick_interval))
    # ax.set_xticks(x_ticks)
    # xtickNames = plt.setp(ax, xticklabels=["{0:.1f}".format(r) for r in x_ticks])
    # ax.set_xticklabels(xtickNames, rotation=45)
    # ax.set_xticklabels(xtickNames)

    ax.grid(linestyle="dotted")
    ax.set_xlabel(xlabel, fontsize=fontsize_axis)
    ax.set_ylabel(ylabel, fontsize=fontsize_axis)

    for i in range(0, len(Xs)):
        X = Xs[i]
        Y = Ys[i]

        # , markersize=10, markeredgewidth=2)
        ax.scatter(X, Y, c=marker_colors[i], marker=markers[i], s=marker_size[i])
        # ax.plot(X, Y)
        # patches[0].set_xy(patches[0].get_xy()[:-1])
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)

    ax.set_xlim(left=xmin, right=xmax)
    ax.set_ylim(bottom=ymin, top=ymax)

    return fig, ax
--------------------------------------------------------------------------------
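The measurement helpers above (in `scripts/utils/measurement_utils.py`) cover the whole target-ping workflow: loading targets and vantage points, building and saving a measurement config, scheduling pings, retrieving results by tag, and inserting them into ClickHouse. The snippet below is a minimal, hypothetical sketch of how they can be chained for a target measurement. The file paths and sample sizes are placeholders, the target file is assumed to contain entries with an `address_v4` field (as `ping_targets` expects), and `dry_run=True` avoids spending RIPE Atlas credits.

```python
from pathlib import Path
from uuid import uuid4

from scripts.utils.measurement_utils import (
    get_measurement_config,
    get_target_prefixes,
    insert_target_results,
    load_targets,
    load_vps,
    ping_targets,
    retrieve_results,
    save_measurement_config,
)

# hypothetical input/output paths: adapt them to your own dataset and result files
TARGETS_FILE = Path("datasets/user_targets.json")
VPS_FILE = Path("datasets/user_vps.json")
CONFIG_FILE = Path("measurements/measurement_config.json")
RESULTS_FILE = Path("measurements/results/user_target_results.json")

targets = load_targets(TARGETS_FILE, nb_target=100)
vps = load_vps(VPS_FILE, nb_vps=50)
# the target file is assumed to hold dicts with an "address_v4" field
target_prefixes = get_target_prefixes([t["address_v4"] for t in targets])

experiment_uuid = uuid4()
target_measurement_uuid = uuid4()
prefix_measurement_uuid = uuid4()

measurement_config = get_measurement_config(
    experiment_uuid=experiment_uuid,
    prefix_measurement_uuid=prefix_measurement_uuid,
    target_measurement_uuid=target_measurement_uuid,
    targets=targets,
    target_prefixes=target_prefixes,
    vps=vps,
    dry_run=True,  # set to False to actually schedule (and pay for) measurements
)

ping_targets(
    measurement_uuid=target_measurement_uuid,
    measurement_config=measurement_config,
    targets=targets,
    vps=vps,
    dry_run=True,
)
save_measurement_config(measurement_config, CONFIG_FILE)

# once the measurements have finished, fetch them by tag and load them into ClickHouse
results = retrieve_results(str(target_measurement_uuid), RESULTS_FILE)
insert_target_results(results)
```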
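Similarly, the helpers in `scripts/utils/plot_utils.py` are thin wrappers around matplotlib used for the paper's figures. Below is a small sketch with fabricated data, assuming nothing beyond the functions shown above; the error samples and output path are made up for illustration.

```python
import random
from pathlib import Path

from scripts.utils.plot_utils import homogenize_legend, plot_multiple_cdf, plot_save

# fabricated geolocation-error samples (km), purely for illustration
errors_a = [random.lognormvariate(3, 1) for _ in range(1000)]
errors_b = [random.lognormvariate(4, 1) for _ in range(1000)]

fig, ax = plot_multiple_cdf(
    Ys=[errors_a, errors_b],
    n_bins=1000,
    xmin=1,
    xmax=10_000,
    xlabel="Geolocation error (km)",
    ylabel="CDF of targets",
    legend=["method A", "method B"],
    xscale="log",
)
homogenize_legend(ax, legend_location="lower right")

# hypothetical output path; plot_save creates the parent directory if needed
plot_save(Path("figures/error_cdf.pdf"), is_tight_layout=True)
```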