├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── new_dataset.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── ACKNOWLEDGMENTS.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── autodeploy-config.json ├── autodeploy ├── README.md └── autodeploy.py ├── basic ├── items.csv └── properties.csv ├── caddy.template.json ├── config.json.example ├── create_db.py ├── docker-compose.yaml ├── documentation ├── README.md ├── assets │ └── gallery │ │ ├── as2497country.svg │ │ ├── as2497domainNames.svg │ │ ├── as2497ixpCountry.svg │ │ ├── as2497names.svg │ │ ├── as2501dependencies.svg │ │ ├── peeringdbAll.svg │ │ ├── prefixAllRelated.svg │ │ ├── top10kJapanAS.svg │ │ └── top20IranAS.svg ├── crawler-best-practices.md ├── data-sources.md ├── database-management.md ├── gallery.md ├── node-types.md ├── relationship-types.md └── writing-a-crawler.md ├── iyp ├── __init__.py ├── crawlers │ ├── __init__.py │ ├── alice_lg │ │ ├── README.md │ │ ├── __init__.py │ │ ├── amsix.py │ │ ├── bcix.py │ │ ├── decix.py │ │ ├── ixbr.py │ │ ├── linx.py │ │ ├── megaport.py │ │ └── netnod.py │ ├── apnic │ │ ├── README.md │ │ └── eyeball.py │ ├── bgpkit │ │ ├── README.md │ │ ├── __init__.py │ │ ├── as2rel_v4.py │ │ ├── as2rel_v6.py │ │ ├── peerstats.py │ │ └── pfx2asn.py │ ├── bgptools │ │ ├── README.md │ │ ├── anycast_prefixes.py │ │ ├── as_names.py │ │ └── tags.py │ ├── caida │ │ ├── README.md │ │ ├── __init__.py │ │ ├── as2org.py │ │ ├── as_relationships_v4.py │ │ ├── as_relationships_v6.py │ │ ├── asrank.py │ │ ├── ix_asns.py │ │ └── ixs.py │ ├── cisco │ │ ├── README.md │ │ └── umbrella_top1m.py │ ├── citizenlab │ │ ├── README.md │ │ └── urldb.py │ ├── cloudflare │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dns_top_ases.py │ │ ├── dns_top_locations.py │ │ ├── ranking_bucket.py │ │ └── top100.py │ ├── emileaben │ │ ├── README.md │ │ └── as_names.py │ ├── example │ │ ├── README.md │ │ └── crawler.py │ ├── google │ │ ├── README.md │ │ └── crux_top1m_country.py │ ├── iana │ │ ├── README.md │ │ └── root_zone.py │ ├── ihr │ │ ├── README.md │ │ ├── __init__.py │ │ ├── country_dependency.py │ │ ├── local_hegemony_v4.py │ │ ├── local_hegemony_v6.py │ │ └── rov.py │ ├── inetintel │ │ ├── README.md │ │ └── as_org.py │ ├── ipinfo │ │ ├── README.md │ │ └── ip_country.py │ ├── manrs │ │ ├── README.md │ │ └── members.py │ ├── nro │ │ ├── README.md │ │ └── delegated_stats.py │ ├── ooni │ │ ├── README.md │ │ ├── __init__.py │ │ ├── facebookmessenger.py │ │ ├── httpheaderfieldmanipulation.py │ │ ├── httpinvalidrequestline.py │ │ ├── osignal.py │ │ ├── psiphon.py │ │ ├── riseupvpn.py │ │ ├── stunreachability.py │ │ ├── telegram.py │ │ ├── tor.py │ │ ├── torsf.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── grabber.py │ │ ├── vanillator.py │ │ ├── webconnectivity.py │ │ └── whatsapp.py │ ├── openintel │ │ ├── README.md │ │ ├── __init__.py │ │ ├── crux.py │ │ ├── dnsgraph_crux.py │ │ ├── dnsgraph_jp.py │ │ ├── dnsgraph_nl.py │ │ ├── dnsgraph_rdns.py │ │ ├── infra_mx.py │ │ ├── infra_ns.py │ │ ├── tranco1m.py │ │ └── umbrella1m.py │ ├── pch │ │ ├── README.md │ │ ├── __init__.py │ │ ├── daily_routing_snapshots_v4.py │ │ ├── daily_routing_snapshots_v6.py │ │ └── show_bgp_parser.py │ ├── peeringdb │ │ ├── README.md │ │ ├── fac.py │ │ ├── ix.py │ │ └── org.py │ ├── ripe │ │ ├── README.md │ │ ├── as_names.py │ │ ├── atlas_measurements.py │ │ ├── atlas_probes.py │ │ └── roa.py │ ├── simulamet │ 
│ ├── README.md │ │ └── rirdata_rdns.py │ ├── stanford │ │ ├── README.md │ │ └── asdb.py │ ├── tranco │ │ ├── README.md │ │ └── top1m.py │ ├── virginiatech │ │ ├── README.md │ │ └── rovista.py │ └── worldbank │ │ ├── README.md │ │ └── country_pop.py └── post │ ├── address_family.py │ ├── clean_links.py │ ├── country_information.py │ ├── ip2prefix.py │ └── url2hostname.py ├── log └── .gitkeep ├── public ├── README.md ├── conf_notls │ └── neo4j.conf └── conf_tls │ └── neo4j.conf ├── requirements.txt ├── run_all_crawlers.sh ├── send_email.py ├── setup.py └── unit_test.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-doc-length = 88 4 | extend-ignore = E741 5 | show-source = true 6 | # autopep8 argument 7 | aggressive = 3 -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Run this command to always ignore formatting commits in `git blame` 2 | # git config blame.ignoreRevsFile .git-blame-ignore-revs 3 | 4 | # Codebase reformat 5 | eec710884f15bf87b1ebe8872802d8ec573a55a8 6 | # Romain's miscellaneous cleanups before codebase reformat 7 | a9638823aaeeeae99150c93074ac3d90fc2a4673 8 | 5247aee3a5d40deaa9e12c06bc92d1ee22a02174 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | *NOTE (Delete after reading): There is no need to open bug reports based on 10 | error messages in the log of the weekly database dump. We usually notice them 11 | and can judge if a simple rerun of the crawler suffices (e.g., due to a 12 | temporary connectivity issue), or if there is a bug in the crawler.* 13 | 14 | **Describe the bug** 15 | A clear and concise description of what the bug is. 16 | 17 | **To Reproduce** 18 | Steps to reproduce the behavior or example query to show the bug. 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 
20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new_dataset.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request new dataset 3 | about: Suggest a dataset to add to the database 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | **Explain the dataset you want to add and how it would contribute to the Internet Yellow Pages.** 10 | A clear and concise description of the dataset. 11 | 12 | **Provide the name of the organization providing the data and the url to the dataset** 13 | - Org name: 14 | - URL: 15 | 16 | **If possible describe how you would like to model the dataset in the Yellow Pages** 17 | - What is the label/type of the nodes used by this dataset? 18 | 19 | - What is the label/type of the links for this dataset? 20 | 21 | - If possible give the cypher description of the links that are desired: 22 | 23 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | 7 | ## Motivation and Context 8 | 9 | 10 | 11 | 12 | ## How Has This Been Tested? 13 | 14 | 15 | 16 | 17 | 18 | ## Screenshots (if appropriate): 19 | 20 | ## Types of changes 21 | 22 | 23 | 24 | - [ ] Bug fix (non-breaking change which fixes an issue) 25 | - [ ] New feature (non-breaking change which adds functionality) 26 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 27 | 28 | ## Checklist: 29 | 30 | 31 | 32 | 33 | - [ ] My code follows the code style of this project. 34 | - [ ] My change requires a change to the documentation. 35 | - [ ] I have updated the documentation accordingly. 36 | 37 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: Pre-commit CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout code 10 | uses: actions/checkout@v4 11 | 12 | - name: Set up Python 13 | uses: actions/setup-python@v5 14 | with: 15 | python-version-file: '.python-version' 16 | cache: 'pip' 17 | 18 | - name: Install dependencies 19 | run: pip install -r requirements.txt 20 | 21 | - name: pre-commit 22 | uses: pre-commit/action@v3.0.0 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Configuration file 2 | config.json 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .history/ 132 | dumps/ 133 | data/ 134 | .vscode/ 135 | neo4j/ 136 | tmp/ 137 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | files: \.py$ 2 | repos: 3 | - repo: https://github.com/PyCQA/autoflake 4 | rev: v2.3.1 5 | hooks: 6 | - id: autoflake 7 | args: [--in-place] 8 | - repo: https://github.com/PyCQA/isort 9 | rev: 6.0.1 10 | hooks: 11 | - id: isort 12 | args: [--thirdparty, neo4j] 13 | - repo: https://github.com/pre-commit/mirrors-autopep8 14 | rev: v2.0.4 15 | hooks: 16 | - id: autopep8 17 | - repo: https://github.com/PyCQA/docformatter 18 | rev: eb1df347edd128b30cd3368dddc3aa65edcfac38 # Don't autoupdate until https://github.com/PyCQA/docformatter/issues/293 is fixed 19 | # rev: v1.7.5 20 | hooks: 21 | - id: docformatter 22 | args: [--in-place, --wrap-summaries, '88', --wrap-descriptions, '88'] 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v5.0.0 25 | hooks: 26 | - id: double-quote-string-fixer 27 | - id: mixed-line-ending 28 | args: ['--fix', 'lf'] 29 | - repo: https://github.com/PyCQA/flake8 30 | rev: 7.1.2 31 | hooks: 32 | - id: flake8 33 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Internet Health Report 2 | 3 | First off, thanks for taking the time to contribute! 
🎉🎉 4 | 5 | When contributing to this repository, please first discuss the change you wish to make via issue 6 | with the maintainers of this repository before making a change. These are mostly guidelines, not 7 | rules. Use your best judgment, and feel free to propose changes to this document in a pull request. 8 | 9 | ## Code of Conduct 10 | 11 | This project and everyone participating in it is governed by the [IHR Code of 12 | Conduct](https://github.com/InternetHealthReport/ihr-website/blob/master/CODE_OF_CONDUCT.md), please 13 | follow it in all your interaction with the project. By participating, you are expected to uphold 14 | this code. Please report unacceptable behavior to admin@ihr.live 15 | 16 | ## Pull Request Process 17 | 18 | 1. Make sure that your code is formatted and passed linting according to the project 19 | requirements. This can easily be achieved by installing the `pre-commit` command as 20 | described below. 21 | 1. Ensure any new dependencies are added to the `requirements.txt` file. 22 | 1. Add only relevant files to the commit and ignore the rest to keep the repo clean. 23 | - If you add a new dataset / crawler, include a README.md describing the crawler 24 | and the nodes / relationships it will push to the database. 25 | - If you change the build process, update the general README.md if required. 26 | 1. You should request review from the maintainers once you submit the Pull Request. 27 | 28 | ## Instructions 29 | 30 | ### Git Workflow 31 | 32 | ```bash 33 | ## Step 1: Fork Repository 34 | 35 | ## Step 2: Git Set Up & Download 36 | # Clone the repo 37 | git clone https://github.com//.git 38 | # Add upstream remote 39 | git remote add upstream https://github.com/InternetHealthReport/internet-yellow-pages.git 40 | # Fetch and merge with upstream/main 41 | git fetch upstream 42 | git merge upstream/main 43 | 44 | ## Step 3: Setup Virtual Environment and Install Dependencies 45 | python3 -m venv --upgrade-deps .venv 46 | source .venv/bin/activate 47 | pip install -r requirements.txt 48 | 49 | ## Step 4: Setup pre-commit 50 | pre-commit install 51 | 52 | ## Step 5: Create and Publish Working Branch 53 | git checkout -b //{} 54 | git push origin //{} 55 | 56 | ## Types: 57 | # wip - Work in Progress; long term work; mainstream changes; 58 | # feat - New Feature; future planned; non-mainstream changes; 59 | # bug - Bug Fixes 60 | # exp - Experimental; random experimental features; 61 | ``` 62 | 63 | ### On Task Completion 64 | 65 | ```bash 66 | ## Commit and Push Your Work 67 | # Check branch 68 | git branch 69 | # Fetch and merge with upstream/main 70 | git fetch upstream 71 | git merge upstream/main 72 | # Add untracked files 73 | git add . 74 | # Commit all changes with appropriate commit message and description 75 | git commit -m "your-commit-message" -m "your-commit-description" 76 | # Fetch and merge with upstream/main again 77 | git fetch upstream 78 | git merge upstream/main 79 | # Push changes to your forked repository 80 | git push origin //{} 81 | ``` 82 | 83 | Create the PR using GitHub Website. 84 | 85 | Create a pull request from `//{}` branch in your forked 86 | repository to the main branch in the upstream repository. 87 | 88 | After creating the PR, add a reviewer (any admin) and yourself as the assignee. Link the PR to 89 | appropriate issue, or Project+Milestone (if no issue was created). 
 90 | 
 91 | ### After PR Merge 
 92 | 
 93 | ```bash 
 94 | # Delete branch from forked repo 
 95 | git branch -d //{} 
 96 | git push --delete origin //{} 
 97 | # Fetch and merge with upstream/main 
 98 | git checkout main 
 99 | git pull upstream 
100 | git push origin 
101 | ``` 
102 | 
103 | - Always follow [commit message standards](https://chris.beams.io/posts/git-commit/) 
104 | - About the [fork-and-branch workflow](https://blog.scottlowe.org/2015/01/27/using-fork-branch-git-workflow/) 
105 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Internet Yellow Pages 
  2 | 
  3 | The Internet Yellow Pages (IYP) is a knowledge database that gathers information about 
  4 | Internet resources (for example ASNs, IP prefixes, and domain names). 
  5 | 
  6 | ## Public IYP prototype 
  7 | 
  8 | Visit <https://iyp.iijlab.net> to try our online prototype. You will find instructions 
  9 | on how to connect to the prototype and some example queries there. For even more 
 10 | examples, check out the [IYP 
 11 | gallery](documentation/gallery.md). 
 12 | 
 13 | ## Deploy a local IYP instance 
 14 | 
 15 | We describe the basic process of deploying a local IYP instance below. For more advanced 
 16 | commands, see the [database documentation](documentation/database-management.md). 
 17 | 
 18 | ### Prerequisites 
 19 | 
 20 | - [Curl](https://curl.se/download.html) 
 21 | - [Docker](https://www.docker.com/) 
 22 | - [Docker Compose](https://docs.docker.com/compose/install/) 
 23 | - about 100GB of free disk space 
 24 | - at least 2GB of RAM 
 25 | 
 26 | ### Download the database dump 
 27 | 
 28 | Visit the [database dump repository](https://archive.ihr.live/ihr/iyp/). 
 29 | 
 30 | Dumps are organized by year, month, and day in this format: 
 31 | 
 32 | ```text 
 33 | https://archive.ihr.live/ihr/iyp/YYYY/MM/DD/iyp-YYYY-MM-DD.dump 
 34 | ``` 
 35 | 
 36 | Replace `YYYY`, `MM`, and `DD` in the URL with the desired date to access a specific 
 37 | database dump. 
 38 | 
 39 | The dump file needs to be called `neo4j.dump` and needs to be put in a folder called 
 40 | `dumps` (`dumps/neo4j.dump`). 
 41 | To create the folder and download a dump with `curl`: 
 42 | 
 43 | ```bash 
 44 | mkdir dumps 
 45 | curl https://archive.ihr.live/ihr/iyp/YYYY/MM/DD/iyp-YYYY-MM-DD.dump -o dumps/neo4j.dump 
 46 | ``` 
 47 | 
 48 | ### Set up IYP 
 49 | 
 50 | To uncompress the dump and start the database, run the following command: 
 51 | 
 52 | ```bash 
 53 | mkdir -p data 
 54 | uid="$(id -u)" gid="$(id -g)" docker compose --profile local up 
 55 | ``` 
 56 | 
 57 | This creates a `data` directory containing the database, loads the database dump, and 
 58 | starts the local IYP instance. This initial setup needs to be done only once, but it takes 
 59 | some time to completely load the database and start IYP. Please wait until IYP is fully 
 60 | loaded. Also note that this step won't work if the data directory already contains a database. 
 61 | 
 62 | This setup keeps the database instance running in the foreground. It can be stopped with 
 63 | `Ctrl+C`. Afterwards, you can simply [start/stop](#startstop-iyp) IYP in the background 
 64 | to use it. To update the database with a new dump, see [Update existing 
 65 | database](documentation/database-management.md#update-existing-database).
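If you prefer to script the dump download described above instead of calling `curl` by hand, the same steps can be wrapped in a few lines of Python. This is only a sketch and not part of the IYP tooling: it assumes the archive URL pattern shown earlier, the `requests` library, and a hypothetical helper name `download_dump`.

```python
from pathlib import Path

import requests

# Archive URL pattern documented above.
ARCHIVE_URL = 'https://archive.ihr.live/ihr/iyp/{y}/{m:02d}/{d:02d}/iyp-{y}-{m:02d}-{d:02d}.dump'


def download_dump(year: int, month: int, day: int, dest: str = 'dumps/neo4j.dump') -> None:
    """Download the IYP dump for the given date to dumps/neo4j.dump (sketch, not official tooling)."""
    url = ARCHIVE_URL.format(y=year, m=month, d=day)
    Path(dest).parent.mkdir(exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)


if __name__ == '__main__':
    download_dump(2024, 6, 15)  # Example date; pick one that exists in the archive.
```

The result is equivalent to the `mkdir`/`curl` commands above, so the same `docker compose` setup step applies afterwards.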
 66 | 
 67 | ### Start/Stop IYP 
 68 | 
 69 | To start the database, run the following command: 
 70 | 
 71 | ```bash 
 72 | docker start iyp 
 73 | ``` 
 74 | 
 75 | To stop the database, run the following command: 
 76 | 
 77 | ```bash 
 78 | docker stop iyp 
 79 | ``` 
 80 | 
 81 | ### Query the database 
 82 | 
 83 | Open <http://localhost:7474> in your favorite browser. To connect the interface to the database, use 
 84 | the default login and password: `neo4j` and `password`, respectively. Then enter your query in the top input field. 
 85 | 
 86 | For example, this finds the IXPs and corresponding country codes where IIJ (AS2497) is present: 
 87 | 
 88 | ```cypher 
 89 | MATCH (iij:AS {asn:2497})-[:MEMBER_OF]-(ix:IXP)--(cc:Country) 
 90 | RETURN iij, ix, cc 
 91 | ``` 
 92 | 
 93 | ![Countries of IXPs where AS2497 is present](/documentation/assets/gallery/as2497ixpCountry.svg) 
 94 | 
 95 | ### IYP gallery 
 96 | 
 97 | See more query examples in the [IYP gallery](/documentation/gallery.md). 
 98 | 
 99 | ## Contributing 
100 | 
101 | Want to [propose a new dataset](documentation/README.md#add-new-datasets) or [implement 
102 | a crawler](documentation/writing-a-crawler.md)? Check out the 
103 | [documentation](documentation/README.md) for more info. 
104 | 
105 | ## Changelog 
106 | 
107 | See: 
108 | 
109 | ## External links 
110 | 
111 | - [Public instance of IYP](https://iyp.iijlab.net) 
112 | - [RIPE86 presentation](https://ripe86.ripe.net/archives/video/1073/) 
113 | - [APNIC blog article](https://blog.apnic.net/2023/09/06/understanding-the-japanese-internet-with-the-internet-yellow-pages/) 
114 | 
--------------------------------------------------------------------------------
/autodeploy-config.json:
--------------------------------------------------------------------------------
 1 | { 
 2 |   "archive_base_url": "https://archive.ihr.live/ihr/iyp/", 
 3 |   "caddy_config_url": "http://sandbox.ihr.live:2019/config", 
 4 |   "caddy_post_url": "http://localhost:2019/load", 
 5 |   "caddy_template": "caddy.template.json", 
 6 |   "urls": { 
 7 |     "active_bolt": "ryan-bolt.ihr.live", 
 8 |     "active_http": "ryan.ihr.live", 
 9 |     "prev_bolt": "ryan-prev-bolt.ihr.live", 
10 |     "prev_http": "ryan-prev.ihr.live" 
11 |   } 
12 | } 
13 | 
--------------------------------------------------------------------------------
/autodeploy/README.md:
--------------------------------------------------------------------------------
 1 | # Autodeployment Script 
 2 | 
 3 | ## Usage 
 4 | 
 5 | ### Starting caddy 
 6 | 
 7 | Make sure that Caddy is running. If not, run it with `docker compose up caddy`. If Caddy 
 8 | was running previously, then the new Caddy instance will resume from the previous 
 9 | config. See the [Caddy docs](https://caddyserver.com/docs/running#docker-compose) for 
10 | more info. 
11 | 
12 | ### Running the script 
13 | 
14 | To run the script, run `python3 -m autodeploy.autodeploy `. This will first find the date 
15 | of the most recent active deployment using the Caddy config. If there is no active 
16 | deployment, today's date is used. With this date, the script will then check ihr-archive 
17 | to see if a dump has been pushed in the subsequent 7 days. If so, a neo4j instance will 
18 | be deployed using that dump. For example, if the latest deployment is for 2024-06-15, 
19 | the script will check if there is a dump for 2024-06-16 to 2024-06-23. 
20 | 
21 | Alternatively, running `python3 -m autodeploy.autodeploy --date [year]-[month]-[day]` will 
22 | check if there is a dump in the archive for the specified date and deploy it directly.
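As a rough illustration of the lookahead described above, the check against ihr-archive can be pictured like this. This is not the actual `autodeploy.py` implementation, just a simplified sketch assuming the dump naming scheme from the main README, the `requests` library, and a hypothetical helper name `find_next_dump`.

```python
from datetime import date, timedelta

import requests

ARCHIVE_BASE = 'https://archive.ihr.live/ihr/iyp/'  # Matches archive_base_url in autodeploy-config.json.


def find_next_dump(last_deployed: date, lookahead_days: int = 7) -> date | None:
    """Return the first date after last_deployed for which a dump exists, or None."""
    for offset in range(1, lookahead_days + 1):
        day = last_deployed + timedelta(days=offset)
        url = (f'{ARCHIVE_BASE}{day.year}/{day.month:02d}/{day.day:02d}/'
               f'iyp-{day.isoformat()}.dump')
        # A HEAD request is enough to know whether a dump was pushed for that day.
        if requests.head(url, timeout=30).status_code == 200:
            return day
    return None


if __name__ == '__main__':
    # With a last deployment of 2024-06-15, this probes the following week of dumps.
    print(find_next_dump(date(2024, 6, 15)))
```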
23 | 24 | ## How it works 25 | 26 | ### Checking for a dump to deploy 27 | 28 | If the date is not provided when running the script, it will first make a request to 29 | Caddy to get the current config. The config is parsed to retrieve the port of the active 30 | database. The date is parsed from the port number as explained below. Starting from this 31 | date, the next 7 days are then checked in ihr-archive for valid dumps. 32 | 33 | #### Caddy Config 34 | 35 | Caddy is updated by substituting the desired ports in the specified Caddy config 36 | template. The ports are constructed with the following structure: 1MMDD for neo4j http 37 | port, and 2MMDD for neo4j bolt port. The json is sent to caddy by making a POST request 38 | to sandbox.ihr.live:2019/load. The current config is retrieved by making a GET request 39 | to sandbox.ihr.live:2019/config. 40 | 41 | ### Starting the database 42 | 43 | Once a dump has been found, its log is downloaded from the archive. If the log indicates 44 | that there are no errors, then the dump is downloaded. A docker container is then 45 | started that loads the dump into a neo4j database. The database is stored in a docker 46 | volume with the name data-MM-DD. Another container is then used to start the database 47 | using the data stored in data-MM-YY. It binds its internal neo4j 7474 and 7687 ports to 48 | the external ones that contain the dump's date. 49 | 50 | If a container is already running for this date, it and its data volume are deleted, and 51 | a new one is created from the downloaded dump data. 52 | 53 | If there was already an active database, it becomes the previous database. The current 54 | previous database container is stopped, and its data volume is deleted. 55 | -------------------------------------------------------------------------------- /basic/items.csv: -------------------------------------------------------------------------------- 1 | #label, description, aliases, statements 2 | #basic property, "Fundamental property for this wikibase",, 3 | #basic item, "Fundamental item for this wikibase",, 4 | autonomous system, "network or group of networks that has a unified routing policy", AS, 5 | organization, "social entity (not necessarily commercial) uniting people into a structured group managing shared means to meet some needs, or to pursue collective goals", org|organisation|organizations|organisations, 6 | Internet exchange point, "physical infrastructure through which Internet service providers (ISPs) and content delivery networks (CDNs) exchange Internet traffic between their networks", ix|IX|ixp|IXP|exchange point|internet exchange, 7 | country, "sovereign state of this item",sovereign state|state|land|host country, 8 | regional Internet registry, "organization that manages the allocation and registration of Internet number resources within a region of the world", RIR,instance of:organization 9 | RIPE NCC, "regional Internet registry for Europe, West Asia, and the former USSR", RIPE,instance of:regional Internet registry 10 | APNIC, "regional Internet address registry (RIR) for the Asia-Pacific region", , instance of:regional Internet registry 11 | ARIN, "regional Internet Registry for Canada, the United States, and many Caribbean and North Atlantic islands", , instance of:regional Internet registry 12 | LACNIC, "regional Internet registry for the Latin American and Caribbean regions" , ,instance of:regional Internet registry 13 | AFRINIC, "regional Internet registry for Africa", , instance of:regional Internet registry 14 | 
computer network protocol, "communications protocol used by data networks",protocol|network protocol|data network protocol, 15 | IPv4, "Internet Protocol version 4",ipv4,instance of:computer network protocol 16 | IPv6, "Internet Protocol version 6",ipv6,instance of:computer network protocol 17 | TCP, "Transmission Control Protocol",tcp,instance of:computer network protocol 18 | UDP, "User Datagram Protocol",udp,instance of:computer network protocol 19 | ICMP, "Internet Control Message Protocol",icmp,instance of:computer network protocol 20 | IP address, "numerical label used to identify a network interface on an IP network",, 21 | IP routing prefix, "part of an IP address that identifies a network often expressed using Classless Inter-Domain Routing (CIDR) notation.", prefix|subnet|IP prefix|address prefix|network prefix,implements:IPv4 22 | PeeringDB, "PeeringDB is a freely available, user-maintained, database of networks, and the go-to location for interconnection data.",peeringdb|peeringDB,instance of:organization 23 | Internet exchange point, "An Internet exchange point is the physical infrastructure through which Internet service providers and content delivery networks (CDNs) exchange Internet traffic between their networks",IXP|IX|ix|ixp, 24 | MANRS, "MANRS is an Internet Society-supported activity aimed at securing global Internet routing",Mutually Agreed Norms for Routing Security,instance of:organization 25 | CAIDA, "Center for Applied Internet Data Analysis",,instance of:organization 26 | Internet Health Report, "IIJ's Internet Health Report",,instance of:organization 27 | Route Views, "project founded by Advanced Network Technology Center at the University of Oregon to allow Internet users to view global Border Gateway Protocol routing information",routeviews|Routeviews|RouteViews,instance of:organization 28 | traffic statistics, "URL for public traffic data",traffic data|statistics page, 29 | route server,"BGP route server is a feature designed for internet exchange (IX) operators that provides an alternative to full eBGP mesh peering among the service providers who have a presence at the IX",routeserver|rs, 30 | domain name, "identification string that defines a realm of administrative autonomy, authority or control within the Internet",, 31 | peering LAN, "LAN used for exchanging traffic between multiple organizations, usually managed by an IXP", ,instance of:IP routing prefix 32 | -------------------------------------------------------------------------------- /caddy.template.json: -------------------------------------------------------------------------------- 1 | { 2 | "apps": { 3 | "http": { 4 | "servers": { 5 | "srv0": { 6 | "listen": [":443"], 7 | "routes": [ 8 | { 9 | "match": [{ "host": ["ryan-bolt.ihr.live"] }], 10 | "handle": [ 11 | { 12 | "handler": "subroute", 13 | "routes": [ 14 | { 15 | "handle": [ 16 | { 17 | "handler": "reverse_proxy", 18 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 19 | } 20 | ] 21 | } 22 | ] 23 | } 24 | ], 25 | "terminal": true 26 | }, 27 | { 28 | "match": [{ "host": ["ryan-prev-bolt.ihr.live"] }], 29 | "handle": [ 30 | { 31 | "handler": "subroute", 32 | "routes": [ 33 | { 34 | "handle": [ 35 | { 36 | "handler": "reverse_proxy", 37 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 38 | } 39 | ] 40 | } 41 | ] 42 | } 43 | ], 44 | "terminal": true 45 | }, 46 | { 47 | "match": [{ "host": ["sandbox.ihr.live"] }], 48 | "handle": [ 49 | { 50 | "handler": "subroute", 51 | "routes": [ 52 | { 53 | "handle": [ 54 | { 55 | "handler": "reverse_proxy", 
56 | "upstreams": [{ "dial": "ryan.ihr.live:" }] 57 | } 58 | ] 59 | } 60 | ] 61 | } 62 | ], 63 | "terminal": true 64 | }, 65 | { 66 | "match": [{ "host": ["ryan.ihr.live"] }], 67 | "handle": [ 68 | { 69 | "handler": "subroute", 70 | "routes": [ 71 | { 72 | "handle": [ 73 | { 74 | "handler": "reverse_proxy", 75 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 76 | } 77 | ] 78 | } 79 | ] 80 | } 81 | ], 82 | "terminal": true 83 | }, 84 | { 85 | "match": [{ "host": ["ryan-prev.ihr.live"] }], 86 | "handle": [ 87 | { 88 | "handler": "subroute", 89 | "routes": [ 90 | { 91 | "handle": [ 92 | { 93 | "handler": "reverse_proxy", 94 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 95 | } 96 | ] 97 | } 98 | ] 99 | } 100 | ], 101 | "terminal": true 102 | } 103 | ] 104 | } 105 | } 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | iyp_loader: 3 | image: neo4j/neo4j-admin:5.26.2-community-debian 4 | profiles: ["local", "public_tls", "public_notls"] 5 | user: "${uid}:${gid}" 6 | container_name: iyp_loader 7 | tty: true 8 | stdin_open: true 9 | volumes: 10 | - ./data:/data 11 | - ./dumps:/dumps 12 | command: neo4j-admin database load neo4j --from-path=/dumps --verbose 13 | 14 | iyp: 15 | image: neo4j:5.26.3 16 | profiles: ["local"] 17 | user: "${uid}:${gid}" 18 | container_name: iyp 19 | restart: unless-stopped 20 | ports: 21 | - 7474:7474 22 | - 7687:7687 23 | volumes: 24 | - ./data:/data 25 | environment: 26 | - NEO4J_AUTH=neo4j/password 27 | depends_on: 28 | iyp_loader: 29 | condition: service_completed_successfully 30 | 31 | iyp_readonly_tls: 32 | image: neo4j:5.26.3 33 | profiles: ["public_tls"] 34 | user: "${uid}:${gid}" 35 | container_name: iyp 36 | restart: unless-stopped 37 | ports: 38 | - 7473:7473 39 | - 7687:7687 40 | volumes: 41 | - ./data:/data 42 | - ./public/certificates:/ssl 43 | - ./public/conf_tls:/conf 44 | - ./public/logs:/logs 45 | environment: 46 | - NEO4J_AUTH=neo4j/password 47 | depends_on: 48 | iyp_loader: 49 | condition: service_completed_successfully 50 | 51 | iyp_readonly_notls: 52 | image: neo4j:5.26.3 53 | profiles: ["public_notls"] 54 | user: "${uid}:${gid}" 55 | container_name: iyp 56 | restart: unless-stopped 57 | ports: 58 | - 7474:7474 59 | - 7687:7687 60 | volumes: 61 | - ./data:/data 62 | - ./public/conf_notls:/conf 63 | - ./public/logs:/logs 64 | environment: 65 | - NEO4J_AUTH=neo4j/password 66 | depends_on: 67 | iyp_loader: 68 | condition: service_completed_successfully 69 | 70 | caddy: 71 | image: caddy:latest 72 | profiles: ["caddy"] 73 | user: "${uid}:${gid}" 74 | container_name: caddy 75 | restart: unless-stopped 76 | ports: 77 | - "80:80" 78 | - "443:443" 79 | - "2019:2019" 80 | environment: 81 | - CADDY_ADMIN=0.0.0.0:2019 82 | volumes: 83 | - ./site:/srv 84 | - caddy_data:/data 85 | - caddy_config:/config 86 | command: /usr/bin/caddy run --resume 87 | 88 | volumes: 89 | caddy_data: 90 | caddy_config: 91 | -------------------------------------------------------------------------------- /documentation/README.md: -------------------------------------------------------------------------------- 1 | # IYP documentation 2 | 3 | ## Ontology 4 | 5 | The list of node and relationship types defined for IYP are available at: 6 | 7 | - [Node types](./node-types.md) 8 | - [Relationship types](./relationship-types.md) 9 | 10 | ## Data sources 11 | 12 | The list of all datasets imported in IYP is 
available [here](data-sources.md). 
13 | The dataset licenses are available in the [acknowledgments](../ACKNOWLEDGMENTS.md). 
14 | 
15 | ## Gallery 
16 | 
17 | The [IYP gallery](./gallery.md) provides example queries to help users browse the 
18 | database. 
19 | 
20 | ## Add new datasets 
21 | 
22 | ### Propose a new dataset 
23 | 
24 | Have an idea for a dataset that should be integrated into IYP? Feel free to propose it 
25 | by opening a new 
26 | [discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions). 
27 | You should describe the dataset, why it is potentially useful, and, if possible, provide 
28 | some initial idea for modeling the data. 
29 | 
30 | The discussion is used to decide if we want to integrate the dataset and how to model 
31 | it. So feel free to propose a dataset even if you have no concrete model in mind. 
32 | 
33 | ### Import a new dataset 
34 | 
35 | If it was decided that the dataset should be integrated into IYP, we will convert the 
36 | discussion into a [GitHub 
37 | issue](https://github.com/InternetHealthReport/internet-yellow-pages/issues). At this 
38 | stage it is open to anyone who wants to implement a crawler for the dataset. 
39 | 
40 | For a detailed description of how to write your first crawler and contribute to IYP, take 
41 | a look at the [IHR contributing guidelines](../CONTRIBUTING.md) and the [crawler 
42 | instructions](writing-a-crawler.md). 
43 | 
--------------------------------------------------------------------------------
/documentation/assets/gallery/as2497country.svg:
--------------------------------------------------------------------------------
 1 | [SVG figure: Neo4j graph visualization showing AS2497 linked to Country 'JP' via COUNTRY and POPULATION relationships]
--------------------------------------------------------------------------------
/documentation/database-management.md:
--------------------------------------------------------------------------------
 1 | # Advanced database commands 
 2 | 
 3 | Here we describe some additional useful commands to handle IYP dumps. 
 4 | 
 5 | ## Update existing database 
 6 | 
 7 | If you [set up the database](../README.md#set-up-iyp), you can load a new dump without 
 8 | recreating the Docker containers. Place the new dump at `dumps/neo4j.dump`, delete the 
 9 | existing database, and run only the loader again: 
10 | 
11 | ```bash 
12 | # If the database is running, stop it. 
13 | # docker stop iyp 
14 | # Delete the existing database 
15 | rm -r data/* 
16 | # Run the loader 
17 | docker start -i iyp_loader 
18 | # Start the database. 
19 | docker start iyp 
20 | ``` 
21 | 
22 | ## Save modified database 
23 | 
24 | If you made changes to the database and want to dump the contents into a file, you can 
25 | use the loader for this. For example, to dump the database into a folder called 
26 | `backups`: 
27 | 
28 | ```bash 
29 | # Directory has to exist or it will be created as root by Docker. 
30 | mkdir -p backups 
31 | uid="$(id -u)" gid="$(id -g)" docker compose run --rm -i -v "$PWD/backups:/backups" iyp_loader neo4j-admin database dump neo4j --to-path=/backups --verbose --overwrite-destination 
32 | ``` 
33 | 
34 | This will create a file called `neo4j.dump` in the `backups` folder. **Note that this
35 | will also overwrite this file if it exists!** 
36 | 
37 | ## View Neo4j logs 
38 | 
39 | To view the logs of the Neo4j container, use the following command: 
40 | 
41 | ```bash 
42 | docker logs -f iyp 
43 | ``` 
44 | 
45 | ## Create a new dump from scratch 
46 | 
47 | **Enabling all crawlers will download a lot of data and take multiple days to create a 
48 | dump.** 
49 | 
50 | Clone this repository: 
51 | 
52 | ```bash 
53 | git clone https://github.com/InternetHealthReport/internet-yellow-pages.git 
54 | cd internet-yellow-pages 
55 | ``` 
56 | 
57 | Create a Python environment and install the Python libraries: 
58 | 
59 | ```bash 
60 | python3 -m venv --upgrade-deps .venv 
61 | source .venv/bin/activate 
62 | pip install -r requirements.txt 
63 | ``` 
64 | 
65 | Create a configuration file from the example file and add API keys. **Note that some 
66 | crawlers do not work without credentials.** 
67 | 
68 | ```bash 
69 | cp config.json.example config.json 
70 | # Edit as needed 
71 | ``` 
72 | 
73 | Create and populate a new database: 
74 | 
75 | ```bash 
76 | python3 create_db.py 
77 | ``` 
78 | 
--------------------------------------------------------------------------------
/documentation/gallery.md:
--------------------------------------------------------------------------------
 1 | # IYP Gallery 
 2 | 
 3 | Below are example queries that you can copy/paste into the [Neo4j browser](https://iyp.iijlab.net/iyp/browser/?dbms=iyp-bolt.iijlab.net:443). 
 4 | 
 5 | Querying the IYP database requires familiarity with: 
 6 | - Cypher, Neo4j's query language: https://neo4j.com/docs/getting-started/current/cypher-intro/ 
 7 | - Basic networking knowledge (IP, prefixes, ASes, etc.) 
 8 | - [IYP ontology](./README.md) 
 9 | 
10 | ### Names for AS2497 
11 | Find 'Name' nodes directly connected to the node corresponding to AS2497. 
12 | ```cypher 
13 | MATCH (a:AS {asn:2497})--(n:Name) RETURN a,n 
14 | ``` 
15 | ![Names for AS2497](/documentation/assets/gallery/as2497names.svg) 
16 | 
17 | 
18 | ### All nodes related to 8.8.8.0/24 
19 | Find nodes of any type that are connected to the node corresponding to prefix 
20 | 8.8.8.0/24. 
21 | ```cypher 
22 | MATCH (gdns:Prefix {prefix:'8.8.8.0/24'})--(neighbor) 
23 | RETURN gdns, neighbor 
24 | ``` 
25 | ![All nodes related to 8.8.8.0/24](/documentation/assets/gallery/prefixAllRelated.svg) 
26 | 
27 | 
28 | ### Country code of AS2497 in delegated files 
29 | Here we search for a country node directly connected to AS2497's node that 
30 | comes from NRO's delegated stats. 
31 | ```cypher 
32 | MATCH (iij:AS {asn:2497})-[{reference_name:'nro.delegated_stats'}]-(cc:Country) 
33 | RETURN iij, cc 
34 | ``` 
35 | ![Country code of AS2497 in delegated files](/documentation/assets/gallery/as2497country.svg) 
36 | 
37 | 
38 | ### Countries of IXPs where AS2497 is present 
39 | Starting from the node corresponding to AS2497, find IXPs where AS2497 is a member 
40 | of, and then the country corresponding to each IXP. 
41 | ```cypher 
42 | MATCH (iij:AS {asn:2497})-[:MEMBER_OF]-(ix:IXP)--(cc:Country) 
43 | RETURN iij, ix, cc 
44 | ``` 
45 | ![Countries of IXPs where AS2497 is present](/documentation/assets/gallery/as2497ixpCountry.svg) 
46 | 
47 | 
48 | ### Top domain names hosted by AS2497 
49 | Select domain names in the top 50k rankings that resolve to an IP originated by 
50 | AS2497.
51 | ```cypher 52 | MATCH (:Ranking)-[r:RANK]-(dn:DomainName)-[:PART_OF]-(hn:HostName)-[:RESOLVES_TO]-(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(iij:AS {asn:2497}) 53 | WHERE r.rank < 50000 AND dn.name = hn.name 54 | RETURN hn, ip, pfx, iij 55 | ``` 56 | ![Top domain names hosted by AS2497](/documentation/assets/gallery/as2497domainNames.svg) 57 | 58 | 59 | ### ASes hosting top domain names in Japan 60 | From the top 10k domain names select domain names that ends with '.jp', the 61 | corresponding IP, prefix, and ASN. 62 | ```cypher 63 | MATCH (:Ranking)-[r:RANK]-(dn:DomainName)-[:PART_OF]-(hn:HostName)-[rt:RESOLVES_TO]-(ip:IP)-[po:PART_OF]-(pfx:Prefix)-[o:ORIGINATE]-(net:AS) 64 | WHERE dn.name ENDS WITH '.jp' AND r.rank<10000 AND dn.name = hn.name 65 | RETURN hn, ip, pfx, net, rt, po, o 66 | ``` 67 | ![ASes hosting top domain names in Japan](/documentation/assets/gallery/top10kJapanAS.svg) 68 | 69 | ### Topology for top ASes in Iran 70 | Select IHR's top 20 ASes in Iran and show how they are connected to each other using AS relationships. 71 | ```cypher 72 | MATCH (a:AS)-[ra:RANK]->(:Ranking {name: 'IHR country ranking: Total AS (IR)'})<-[rb:RANK]-(b:AS)-[p:PEERS_WITH]-(a) 73 | WHERE ra.rank < 20 AND rb.rank < 20 AND p.rel = 0 74 | RETURN a, p, b 75 | ``` 76 | ![Top ASes connecting Iran](/documentation/assets/gallery/top20IranAS.svg) 77 | 78 | ### Topology for AS2501's dependencies 79 | Select AS dependencies for AS2501 and find the shortest PEERS_WITH relationship to these ASes. 80 | ```cypher 81 | MATCH (a:AS {asn:2501})-[h:DEPENDS_ON {af:4}]->(d:AS) 82 | WITH a, COLLECT(DISTINCT d) AS dependencies 83 | UNWIND dependencies as d 84 | MATCH p = allShortestPaths((a)-[:PEERS_WITH*]-(d)) 85 | WHERE a.asn <> d.asn AND all(r IN relationships(p) WHERE r.af = 4) AND all(n IN nodes(p) WHERE n IN dependencies) 86 | RETURN p 87 | ``` 88 | ![Dependencies for AS2501](/documentation/assets/gallery/as2501dependencies.svg) 89 | 90 | ### List of IPs for RIPE RIS full feed peers (more than 800k prefixes) 91 | 92 | ```cypher 93 | MATCH (n:BGPCollector)-[p:PEERS_WITH]-(a:AS) 94 | WHERE n.project = 'riperis' AND p.num_v4_pfxs > 800000 95 | RETURN n.name, COUNT(DISTINCT p.ip) AS nb_full, COLLECT(DISTINCT p.ip) AS ips_full 96 | ``` 97 | 98 | ### Active RIPE Atlas probes for the top 5 ISPs in Japan 99 | 100 | ```cypher 101 | MATCH (pb:AtlasProbe)-[:LOCATED_IN]-(a:AS)-[pop:POPULATION]-(c:Country) 102 | WHERE c.country_code = 'JP' AND pb.status_name = 'Connected' AND pop.rank <= 5 103 | RETURN pop.rank, a.asn, COLLECT(pb.id) AS probe_ids ORDER BY pop.rank 104 | ``` 105 | -------------------------------------------------------------------------------- /documentation/writing-a-crawler.md: -------------------------------------------------------------------------------- 1 | # How to write your first crawler 2 | 3 | To import a new dataset into IYP, you should write a crawler for that dataset. The main 4 | tasks of a crawler are to fetch data, parse it, model it with IYP ontology, and push it 5 | to the IYP database. Most of these tasks are assisted by the [IYP python 6 | library](../iyp/__init__.py). See the [example 7 | crawler](../iyp/crawlers/example/crawler.py) or [existing crawlers](../iyp/crawlers/) 8 | for getting started. 9 | See also the [IHR contributing guidelines](../CONTRIBUTING.md) and [best practices for 10 | writing crawlers](crawler-best-practices.md). 11 | 12 | ## README 13 | 14 | Each crawler should be accompanied by a README.md file. 
This is the main documentation 15 | for the crawler, it should contain: 16 | 17 | - a short description of the dataset, 18 | - any specificities related to the way the data is imported (e.g., time span, data cleaning), 19 | - examples of how the data is modeled, 20 | - dependencies to other crawlers (e.g., if the crawler requires data from another one). 21 | 22 | ## Adding a crawler to IYP main branch 23 | 24 | If you wish your crawler to be part of the IYP weekly dumps, you can submit a [Pull 25 | Request](https://github.com/InternetHealthReport/internet-yellow-pages/pulls) to include 26 | the crawler to IYP's GitHub repository main branch. 27 | 28 | Along with the Python code and README, the addition of new datasets should also be 29 | reflected in the following files: 30 | 31 | - the list of [imported datasets](./data-sources.md), 32 | - the [IYP acknowledgments](../ACKNOWLEDGMENTS.md) file should list the license of all imported dataset. 33 | 34 | Changes to the ontology should be discussed in advance, either on [GitHub 35 | discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) 36 | or by reaching out to [IYP maintainers](mailto:iyp@ihr.live), so that a consensus is reached 37 | before the ontology is updated. 38 | **Any change to the ontology should be reflected in the documentation:** ([Node 39 | types](./node-types.md) and [Relationship types](./relationship-types.md)). 40 | 41 | You can also consider adding example queries to the [IYP gallery](./gallery.md), and 42 | organizations providing data to the [IYP frontpage](https://iyp.iijlab.net/). 43 | -------------------------------------------------------------------------------- /iyp/crawlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternetHealthReport/internet-yellow-pages/c79ced8783c73f282d273a886f4aa5cf8018fdd9/iyp/crawlers/__init__.py -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/README.md: -------------------------------------------------------------------------------- 1 | # Alice-LG -- https://github.com/alice-lg/alice-lg 2 | 3 | Alice-LG is a BGP looking glass which gets its data from external APIs. 4 | 5 | It is used by some large IXPs (e.g., DE-CIX, LINX, AMS-IX) and IYP imports membership 6 | information by reading the route server neighbors. 7 | 8 | The crawler *can* also import the received routes of all neighbors, however testing has 9 | shown that this takes an unreasonable amount of time for most IXPs due to the tiny 10 | pagination size (250 routes per page). Therefore this functionality is disabled by default. 11 | 12 | List of supported IXPs: 13 | 14 | - AMS-IX (`amsix.py`) 15 | - BCIX (`bcix.py`) 16 | - DE-CIX (`decix.py`) 17 | - IX.br (`ixbr.py`) 18 | - LINX (`linx.py`) 19 | - Megaport (`megaport.py`) 20 | - Netnod (`netnod.py`) 21 | 22 | ## Graph representation 23 | 24 | ```Cypher 25 | (:AS {asn: 2497})-[:MEMBER_OF {address: '80.81.193.136', routeserver_id: 'rs1_fra_ipv4'}]->(:IXP {name: 'DE-CIX Frankfurt'}) 26 | // Routes are not crawled by default 27 | (:AS {asn: 3333})-[:ORIGINATE {neighbor_id: 'pb_0280_as20562', routeserver_id: 'rs01-bcix-v4'}]->(:BGPPrefix {prefix: '193.0.0.0/21'}) 28 | ``` 29 | 30 | There is the possibility of multiple relationships between the same node. 
However, these 31 | contain different information, e.g., a member is present with multiple interfaces 32 | (`address`) or the information is seen by different route servers (`routeserver_id`). 33 | Similarly, a route can be seen via multiple neighbors (`neighbor_id`) or different route 34 | servers (`routeserver_id`). 35 | 36 | ## Dependence 37 | 38 | This crawler requires peering LAN information to map the neighbor IP to an IXP. 39 | Therefore, it should be run after crawlers that create 40 | 41 | ```Cypher 42 | (:PeeringLAN)-[:MANAGED_BY]->(:IXP) 43 | ``` 44 | 45 | relationships: 46 | 47 | - `iyp.crawlers.peeringdb.ix` 48 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/amsix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.ams-ix.net/api/v1/' 9 | NAME = 'alice_lg.amsix' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/bcix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.bcix.de/api/v1/' 9 | NAME = 'alice_lg.bcix' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/decix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.de-cix.net/api/v1/' 9 | NAME = 'alice_lg.decix' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 
| ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/ixbr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.ix.br/api/v1/' 9 | NAME = 'alice_lg.ixbr' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/linx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://alice-rs.linx.net/api/v1/' 9 | NAME = 'alice_lg.linx' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/megaport.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.megaport.com/api/v1/' 9 | NAME = 'alice_lg.megaport' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 
| -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/netnod.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.netnod.se/api/v1/' 9 | NAME = 'alice_lg.netnod' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/apnic/README.md: -------------------------------------------------------------------------------- 1 | # APNIC population estimates -- https://labs.apnic.net/ 2 | 3 | Population per AS estimated using an advertisement on Youtube. This dataset 4 | provides: 5 | - the percentage of the population per country/AS 6 | - AS ranking in terms of population 7 | - AS names 8 | 9 | 10 | ## Graph representation 11 | 12 | ### Population 13 | Connect AS to country nodes with a 'population' relationship representing the 14 | percentage of the country's population hosted by the AS. 15 | 16 | ``` 17 | (:AS {asn:2516})-[:POPULATION {percent:19.3}]-(:Country {country_code:'JP'}) 18 | ``` 19 | 20 | 21 | ### Country 22 | Connect AS to country nodes, meaning that the AS serves people in that country. 23 | 24 | ``` 25 | (:AS)-[:COUNTRY]-(:Country) 26 | ``` 27 | 28 | ### Ranking 29 | Connect ASes to ranking nodes which are also connected to a country. Meaning 30 | that an AS is ranked for a certain country in terms of population. 31 | For example: 32 | ``` 33 | (:AS {asn:2516})-[:RANK {rank:1}]-(:Ranking)--(:Country {country_code:'JP'}) 34 | ``` 35 | 36 | ### AS name 37 | Connect AS to names nodes, providing the name of ranked ASes. 38 | For example: 39 | ``` 40 | (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) 41 | ``` 42 | 43 | 44 | ## Dependence 45 | 46 | This crawler is not depending on other crawlers. 47 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/README.md: -------------------------------------------------------------------------------- 1 | # BGPKIT data -- https://data.bgpkit.com 2 | 3 | Data inferred from RouteViews and RIPE RIS BGP data, including: 4 | - AS relationship 5 | - prefix to ASN mappings 6 | - BGP collectors' peers stats 7 | 8 | 9 | ## Graph representation 10 | 11 | ### AS relationship 12 | Connect ASes that are peering with each other. The 'rel' attribute and the link 13 | direction gives the type of relationship between the two ASes: 14 | - rel=0: peer to peer relationship 15 | - rel=1: provider/customer relationship. A->B means A is the provider of B. 
16 | 17 | ``` 18 | (:AS {asn:2497})-[:PEERS_WITH {rel: 0, af: 4}]-(:AS {asn:2914}) 19 | ``` 20 | 21 | 22 | ### Peers stats 23 | Connect AS nodes to BGP route collector nodes, meaning that an AS peers with 24 | a route collector hence participating in the RIS or RouteViews projects. 25 | 26 | ``` 27 | (:AS {asn:2497})-[:PEERS_WITH]-(:BGPCollector {project: 'riperis', name:'rrc06'}) 28 | ``` 29 | 30 | ### Prefix to ASN 31 | Connect AS nodes to prefix nodes representing the prefixes originated by an AS. 32 | For example: 33 | ``` 34 | (:AS {asn:2497})-[:ORIGINATE]-(:BGPPrefix {prefix: '101.128.128.0/17'}) 35 | ``` 36 | 37 | ## Dependence 38 | 39 | This crawler is not depending on other crawlers. 40 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/__init__.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import json 3 | 4 | import requests 5 | 6 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 7 | 8 | 9 | class AS2RelCrawler(BaseCrawler): 10 | def __init__(self, organization, url, name, af): 11 | """Initialization: set the address family attribute (af)""" 12 | super().__init__(organization, url, name) 13 | self.af = af 14 | self.reference['reference_url_info'] = 'https://data.bgpkit.com/as2rel/README.txt' 15 | 16 | def run(self): 17 | """Fetch the AS relationship file from BGPKIT website and process lines one by 18 | one.""" 19 | 20 | req = requests.get(self.url, stream=True) 21 | req.raise_for_status() 22 | 23 | set_modification_time_from_last_modified_header(self.reference, req) 24 | 25 | rels = [] 26 | asns = set() 27 | 28 | # Collect all ASNs 29 | for rel in json.load(bz2.open(req.raw)): 30 | asns.add(rel['asn1']) 31 | asns.add(rel['asn2']) 32 | rels.append(rel) 33 | 34 | # get ASNs IDs 35 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 36 | 37 | # Compute links 38 | links = [] 39 | for rel in rels: 40 | as1_qid = self.asn_id[rel['asn1']] 41 | as2_qid = self.asn_id[rel['asn2']] 42 | rel['af'] = self.af 43 | 44 | links.append({'src_id': as1_qid, 'dst_id': as2_qid, 'props': [self.reference, rel]}) 45 | 46 | # Push all links to IYP 47 | self.iyp.batch_add_links('PEERS_WITH', links) 48 | 49 | def unit_test(self): 50 | return super().unit_test(['PEERS_WITH']) 51 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/as2rel_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.bgpkit import AS2RelCrawler 6 | 7 | URL = 'https://data.bgpkit.com/as2rel/as2rel-v4-latest.json.bz2' 8 | ORG = 'BGPKIT' 9 | NAME = 'bgpkit.as2rel_v4' 10 | 11 | AF = 4 12 | 13 | 14 | class Crawler(AS2RelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, AF) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: 
{sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/as2rel_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.bgpkit import AS2RelCrawler 6 | 7 | URL = 'https://data.bgpkit.com/as2rel/as2rel-v6-latest.json.bz2' 8 | ORG = 'BGPKIT' 9 | NAME = 'bgpkit.as2rel_v6' 10 | 11 | AF = 6 12 | 13 | 14 | class Crawler(AS2RelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, AF) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/peerstats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bz2 3 | import json 4 | import logging 5 | import sys 6 | from datetime import datetime, time, timedelta, timezone 7 | 8 | import requests 9 | 10 | from iyp import BaseCrawler 11 | 12 | MAIN_PAGE = 'https://data.bgpkit.com/peer-stats/' 13 | URL = 'https://data.bgpkit.com/peer-stats/{collector}/{year}/{month:02d}/peer-stats_{collector}_{year}-{month:02d}-{day:02d}_{epoch}.bz2' # noqa: E501 14 | ORG = 'BGPKIT' 15 | NAME = 'bgpkit.peerstats' 16 | 17 | 18 | class Crawler(BaseCrawler): 19 | def __init__(self, organization, url, name): 20 | super().__init__(organization, url, name) 21 | self.reference['reference_url_info'] = 'https://data.bgpkit.com/peer-stats/README.md' 22 | 23 | def run(self): 24 | """Fetch peer stats for each collector.""" 25 | 26 | req = requests.get(MAIN_PAGE) 27 | req.raise_for_status() 28 | 29 | # Find all collectors 30 | collectors = [] 31 | for line in req.text.splitlines(): 32 | if line.strip().startswith('') and line.endswith('/'): 33 | collectors.append(line.partition('>')[2].partition('/')[0]) 34 | 35 | # Find latest date 36 | prev_day = datetime.combine(datetime.utcnow(), time.min, timezone.utc) 37 | self.now = None 38 | req = None 39 | trials = 0 40 | 41 | while (req is None or req.status_code != 200) and trials < 7: 42 | self.now = prev_day 43 | # Check if today's data is available 44 | url = URL.format(collector='rrc10', year=self.now.year, 45 | month=self.now.month, day=self.now.day, 46 | epoch=int(self.now.timestamp())) 47 | req = requests.head(url) 48 | 49 | prev_day -= timedelta(days=1) 50 | logging.warning("Today's data not yet available!") 51 | 52 | self.reference['reference_time_modification'] = self.now 53 | for collector in collectors: 54 | url = URL.format(collector=collector, year=self.now.year, 55 | month=self.now.month, day=self.now.day, 56 | epoch=int(self.now.timestamp())) 57 | 58 | req = requests.get(url, stream=True) 59 | if req.status_code != 200: 60 | logging.warning(f'Data not 
available for {collector}') 61 | continue 62 | 63 | # keep track of collector and reference url 64 | stats = json.load(bz2.open(req.raw)) 65 | collector_qid = self.iyp.get_node( 66 | 'BGPCollector', 67 | {'name': stats['collector'], 'project': stats['project']} 68 | ) 69 | self.reference['reference_url_data'] = url 70 | 71 | asns = set() 72 | 73 | # Collect all ASNs and names 74 | for peer in stats['peers'].values(): 75 | asns.add(peer['asn']) 76 | 77 | # get ASNs' IDs 78 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 79 | 80 | # Compute links 81 | links = [] 82 | for peer in stats['peers'].values(): 83 | as_qid = self.asn_id[peer['asn']] 84 | links.append({'src_id': as_qid, 'dst_id': collector_qid, 85 | 'props': [self.reference, peer]}) # Set AS name 86 | 87 | # Push all links to IYP 88 | self.iyp.batch_add_links('PEERS_WITH', links) 89 | 90 | def unit_test(self): 91 | return super().unit_test(['PEERS_WITH']) 92 | 93 | 94 | def main() -> None: 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument('--unit-test', action='store_true') 97 | args = parser.parse_args() 98 | 99 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 100 | logging.basicConfig( 101 | format=FORMAT, 102 | filename='log/' + NAME + '.log', 103 | level=logging.INFO, 104 | datefmt='%Y-%m-%d %H:%M:%S' 105 | ) 106 | 107 | logging.info(f'Started: {sys.argv}') 108 | 109 | crawler = Crawler(ORG, URL, NAME) 110 | if args.unit_test: 111 | crawler.unit_test() 112 | else: 113 | crawler.run() 114 | crawler.close() 115 | logging.info(f'Finished: {sys.argv}') 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | sys.exit(0) 121 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/pfx2asn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bz2 3 | import json 4 | import logging 5 | import sys 6 | from ipaddress import ip_network 7 | 8 | import requests 9 | 10 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 11 | 12 | URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2' 13 | ORG = 'BGPKIT' 14 | NAME = 'bgpkit.pfx2asn' 15 | 16 | 17 | class Crawler(BaseCrawler): 18 | 19 | def run(self): 20 | """Fetch the prefix to ASN file from BGPKIT website and process lines one by 21 | one.""" 22 | 23 | req = requests.get(URL, stream=True) 24 | req.raise_for_status() 25 | 26 | set_modification_time_from_last_modified_header(self.reference, req) 27 | 28 | entries = [] 29 | asns = set() 30 | prefixes = set() 31 | 32 | for entry in json.load(bz2.open(req.raw)): 33 | try: 34 | prefix = ip_network(entry['prefix']).compressed 35 | except ValueError as e: 36 | logging.warning(f'Ignoring malformed prefix: "{entry["prefix"]}": {e}') 37 | continue 38 | entry['prefix'] = prefix 39 | prefixes.add(prefix) 40 | asns.add(entry['asn']) 41 | entries.append(entry) 42 | 43 | req.close() 44 | 45 | # get ASNs and prefixes IDs 46 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 47 | self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes, all=False) 48 | # Add the BGPPrefix label 49 | self.iyp.batch_add_node_label(list(self.prefix_id.values()), 'BGPPrefix') 50 | 51 | # Compute links 52 | links = [] 53 | for entry in entries: 54 | asn_qid = self.asn_id[entry['asn']] 55 | prefix_qid = self.prefix_id[entry['prefix']] 56 | 57 | links.append({'src_id': asn_qid, 'dst_id': prefix_qid, 'props': [self.reference, 
entry]}) 58 | 59 | # Push all links to IYP 60 | self.iyp.batch_add_links('ORIGINATE', links) 61 | 62 | def unit_test(self): 63 | return super().unit_test(['ORIGINATE']) 64 | 65 | 66 | def main() -> None: 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--unit-test', action='store_true') 69 | args = parser.parse_args() 70 | 71 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 72 | logging.basicConfig( 73 | format=FORMAT, 74 | filename='log/' + NAME + '.log', 75 | level=logging.INFO, 76 | datefmt='%Y-%m-%d %H:%M:%S' 77 | ) 78 | 79 | logging.info(f'Started: {sys.argv}') 80 | 81 | crawler = Crawler(ORG, URL, NAME) 82 | if args.unit_test: 83 | crawler.unit_test() 84 | else: 85 | crawler.run() 86 | crawler.close() 87 | logging.info(f'Finished: {sys.argv}') 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | sys.exit(0) 93 | -------------------------------------------------------------------------------- /iyp/crawlers/bgptools/README.md: -------------------------------------------------------------------------------- 1 | # BGP.Tools -- https://bgp.tools 2 | 3 | Data collected by BGP.Tools, including: 4 | - AS names 5 | - AS tags 6 | - Anycast IPv4, and IPv6 prefixes 7 | 8 | 9 | ## Graph representation 10 | 11 | ### AS names 12 | Connect AS to names nodes, providing the name of an AS. 13 | For example: 14 | ``` 15 | (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) 16 | ``` 17 | 18 | ### AS tags 19 | Connect AS to tag nodes meaning that an AS has been categorized according to the 20 | given tag. 21 | ``` 22 | (:AS {asn:2497})-[:CATEGORIZED]-(:Tag {label: 'Internet Critical Infra'}) 23 | ``` 24 | 25 | ### Anycast IPv4 and IPv6 prefixes 26 | Connect Prefix to tag node meaning that an prefix has been categorized according to the TAG with a label `Anycast`. 27 | ``` 28 | (:BGPPrefix {prefix: '43.249.213.0/24'})-[:CATEGORIZED]-(:Tag {label: 'Anycast'}) 29 | ``` 30 | 31 | ## Dependence 32 | 33 | This crawler is not depending on other crawlers. 34 | -------------------------------------------------------------------------------- /iyp/crawlers/bgptools/as_names.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from io import BytesIO 5 | 6 | import pandas as pd 7 | import requests 8 | 9 | from iyp import BaseCrawler 10 | 11 | URL = 'https://bgp.tools/asns.csv' 12 | ORG = 'BGP.Tools' 13 | NAME = 'bgptools.as_names' 14 | 15 | 16 | class Crawler(BaseCrawler): 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name) 19 | self.reference['reference_url_info'] = 'https://bgp.tools/kb/api' 20 | 21 | self.headers = { 22 | 'user-agent': 'IIJ/Internet Health Report - admin@ihr.live' 23 | } 24 | 25 | @staticmethod 26 | def replace_link_ids(links: list, src_id: dict = dict(), dst_id=dict()): 27 | """Replace the src_id and dst_id values from links with their actual id.""" 28 | for link in links: 29 | if src_id: 30 | link['src_id'] = src_id[link['src_id']] 31 | if dst_id: 32 | link['dst_id'] = dst_id[link['dst_id']] 33 | 34 | def run(self): 35 | """Fetch the AS name file from BGP.Tools website and push it to IYP.""" 36 | 37 | req = requests.get(URL, headers=self.headers) 38 | req.raise_for_status() 39 | 40 | df = pd.read_csv(BytesIO(req.content), keep_default_na=False) 41 | 42 | asns = set() 43 | names = set() 44 | tags = set() 45 | name_links = list() 46 | tag_links = list() 47 | 48 | # Normally we would use itertuples, since it is way faster. 
But we want to be 49 | # robust against format changes and since one column is called "class", which is 50 | # a Python keyword, the field name would be replaced by a positional value, 51 | # e.g., r._3 instead of r.class, which means that if the format is changed, this 52 | # crawler breaks again. 53 | # Since the data set is not too large, iterrows is fine performance-wise. 54 | for r in df.iterrows(): 55 | has_link = False 56 | entry = r[1] 57 | asn = entry['asn'] 58 | if not asn.startswith('AS'): 59 | logging.warning(f'asn field does not start with "AS": {entry}') 60 | continue 61 | asn = int(asn[2:]) 62 | name = entry['name'] 63 | if name != 'ERR_AS_NAME_NOT_FOUND': 64 | names.add(name) 65 | name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]}) 66 | has_link = True 67 | tag = entry['class'] 68 | if tag != 'Unknown': 69 | tags.add(tag) 70 | tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]}) 71 | has_link = True 72 | if has_link: 73 | # Only create AS nodes if we have a relationship. 74 | asns.add(asn) 75 | 76 | asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 77 | name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False) 78 | tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False) 79 | 80 | self.replace_link_ids(name_links, asn_id, name_id) 81 | self.replace_link_ids(tag_links, asn_id, tag_id) 82 | 83 | self.iyp.batch_add_links('NAME', name_links) 84 | self.iyp.batch_add_links('CATEGORIZED', tag_links) 85 | 86 | def unit_test(self): 87 | return super().unit_test(['NAME']) 88 | 89 | 90 | def main() -> None: 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('--unit-test', action='store_true') 93 | args = parser.parse_args() 94 | 95 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 96 | logging.basicConfig( 97 | format=FORMAT, 98 | filename='log/' + NAME + '.log', 99 | level=logging.INFO, 100 | datefmt='%Y-%m-%d %H:%M:%S' 101 | ) 102 | 103 | logging.info(f'Started: {sys.argv}') 104 | 105 | crawler = Crawler(ORG, URL, NAME) 106 | if args.unit_test: 107 | crawler.unit_test() 108 | else: 109 | crawler.run() 110 | crawler.close() 111 | logging.info(f'Finished: {sys.argv}') 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | sys.exit(0) 117 | -------------------------------------------------------------------------------- /iyp/crawlers/bgptools/tags.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from datetime import datetime, time, timezone 5 | 6 | import requests 7 | 8 | from iyp import BaseCrawler 9 | 10 | # curl -s https://bgp.tools/asns.csv | head -n 5 11 | URL = 'https://bgp.tools/tags/' 12 | ORG = 'BGP.Tools' 13 | NAME = 'bgptools.tags' 14 | 15 | TAGS = { 16 | 'cdn': 'Content Delivery Network', 17 | 'dsl': 'Home ISP', 18 | 'a10k': 'Tranco 10k Host', 19 | 'icrit': 'Internet Critical Infra', 20 | 'tor': 'ToR Services', 21 | 'anycast': 'Anycast', 22 | 'perso': 'Personal ASN', 23 | 'ddosm': 'DDoS Mitigation', 24 | 'vpn': 'VPN Host', 25 | 'vpsh': 'Server Hosting', 26 | 'uni': 'Academic', 27 | 'gov': 'Government', 28 | 'event': 'Event', 29 | 'mobile': 'Mobile Data/Carrier', 30 | 'satnet': 'Satellite Internet', 31 | 'biznet': 'Business Broadband', 32 | 'corp': 'Corporate/Enterprise', 33 | 'rpkirov': 'Validating RPKI ROV' 34 | } 35 | 36 | 37 | class Crawler(BaseCrawler): 38 | def __init__(self, organization, url, name): 39 | 
super().__init__(organization, url, name) 40 | self.reference['reference_url_info'] = 'https://bgp.tools/kb/api' 41 | 42 | self.headers = { 43 | 'user-agent': 'IIJ/Internet Health Report - admin@ihr.live' 44 | } 45 | 46 | def run(self): 47 | """Fetch the AS name file from BGP.Tools website and process lines one by 48 | one.""" 49 | 50 | for tag, label in TAGS.items(): 51 | url = URL + tag + '.csv' 52 | # Reference information for data pushed to the wikibase 53 | self.reference = { 54 | 'reference_org': ORG, 55 | 'reference_url_data': url, 56 | 'reference_name': NAME, 57 | 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc) 58 | } 59 | 60 | req = requests.get(url, headers=self.headers) 61 | req.raise_for_status() 62 | 63 | self.tag_qid = self.iyp.get_node('Tag', {'label': label}) 64 | for line in req.text.splitlines(): 65 | # skip header 66 | if line.startswith('asn'): 67 | continue 68 | 69 | # Parse given line to get ASN, name, and country code 70 | asn, _, _ = line.partition(',') 71 | asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]}) 72 | statements = [['CATEGORIZED', self.tag_qid, self.reference]] # Set AS name 73 | 74 | # Update AS name and country 75 | self.iyp.add_links(asn_qid, statements) 76 | 77 | def unit_test(self): 78 | return super().unit_test(['CATEGORIZED']) 79 | 80 | 81 | def main() -> None: 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument('--unit-test', action='store_true') 84 | args = parser.parse_args() 85 | 86 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 87 | logging.basicConfig( 88 | format=FORMAT, 89 | filename='log/' + NAME + '.log', 90 | level=logging.INFO, 91 | datefmt='%Y-%m-%d %H:%M:%S' 92 | ) 93 | 94 | logging.info(f'Started: {sys.argv}') 95 | 96 | crawler = Crawler(ORG, URL, NAME) 97 | if args.unit_test: 98 | crawler.unit_test() 99 | else: 100 | crawler.run() 101 | crawler.close() 102 | logging.info(f'Finished: {sys.argv}') 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | sys.exit(0) 108 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/README.md: -------------------------------------------------------------------------------- 1 | # CAIDA -- https://caida.org 2 | 3 | ## ASRank (asrank.py) 4 | 5 | AS rank in terms of customer cone size, meaning that large transit providers are 6 | higher ranked. 7 | 8 | ### Graph representation 9 | 10 | Ranking: 11 | 12 | Connect ASes nodes to a single ranking node corresponding to ASRank. The rank is 13 | given as a link attribute. 14 | For example: 15 | 16 | ```cypher 17 | (:AS {asn:2497})-[:RANK {rank:87}]-(:Ranking {name:'CAIDA ASRank'}) 18 | ``` 19 | 20 | Country: 21 | 22 | Connect AS to country nodes, meaning that the AS is registered in that country. 23 | 24 | ```cypher 25 | (:AS)-[:COUNTRY]-(:Country) 26 | ``` 27 | 28 | AS name: 29 | 30 | Connect AS to names nodes, providing the name of an AS. 31 | For example: 32 | 33 | ```cypher 34 | (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) 35 | ``` 36 | 37 | ### Dependence 38 | 39 | The asrank crawler is not depending on other crawlers. 40 | 41 | ## IXPs (ixs.py) 42 | 43 | List of IXPs obtained from PeeringDB, Hurricane Electric, Packet Clearing House. 
44 | 45 | ### Graph representation 46 | 47 | Nodes: 48 | 49 | - `(:IXP {name})`: IXP node 50 | - `(:Name {name})`: Name of IXP 51 | - `(:PeeringLAN {prefix})`: Prefix of IXP peering LAN 52 | - `(:CaidaIXID {id})`: ID of the IXP assigned by CAIDA 53 | - `(:Country {country_code})`: Country code 54 | - `(:URL {url})`: Website of IXP 55 | 56 | Relationships: 57 | 58 | ```Cypher 59 | (:IXP)-[:COUNTRY]->(:Country) 60 | (:IXP)-[:EXTERNAL_ID]->(:CaidaIXID) 61 | (:IXP)-[:NAME]->(:Name) 62 | (:IXP)-[:WEBSITE]->(:URL) 63 | (:PeeringLAN)-[:MANAGED_BY]->(:IXP) 64 | ``` 65 | 66 | ### Dependence 67 | 68 | The ixs crawler depends on the peeringdb.ix crawler. 69 | 70 | ## IXP memberships (ix_asns.py) 71 | 72 | List of ASes present at each IXP. 73 | 74 | ### Graph representation 75 | 76 | Relationships: 77 | 78 | ```cypher 79 | (:AS)-[:MEMBER_OF]->(:IXP) 80 | ``` 81 | 82 | ### Dependence 83 | 84 | The ix_asns crawler depends on the ixs crawler. 85 | 86 | ## AS relationships (as_relationships_v[4|6].py) 87 | 88 | Inferred AS relationships (peer-to-peer or customer-provider). 89 | 90 | ### Graph representation 91 | 92 | ```cypher 93 | (:AS {asn: 2497})-[r:PEERS_WITH {af: 4, rel: -1}]->(:AS {asn: 7500}) 94 | ``` 95 | 96 | Either the `reference_name` or `af` properties can be used to distinguish between IPv4 97 | and IPv6. 98 | 99 | `rel: -1` indicates customer-provider, and the direction of the relationship is modeled 100 | as `provider -> customer` to be consistent with `bgpkit.as2rel`. 101 | 102 | `rel: 0` indicates a peer-to-peer relationship. 103 | 104 | **Note:** While both CAIDA and BGPKIT use `rel: 0` to indicate a peer-to-peer 105 | relationship, BGPKIT uses `rel: 1` for customer-provider, whereas CAIDA uses `rel: -1`. A sketch for normalizing the two encodings is given at the end of this README. 106 | 107 | ### Dependence 108 | 109 | The as_relationship crawler does not depend on other crawlers. 110 | 111 | ## AS Organizations (as2org.py) 112 | 113 | AS to organization mappings derived from the quarterly WHOIS dumps. 114 | 115 | ### Graph representation 116 | 117 | ```cypher 118 | (:AS {asn: 2497})-[:MANAGED_BY {org_id: '@aut-2497-JPNIC'}]->(:Organization {name: 'Internet Initiative Japan Inc.'}) 119 | (:Organization {name: 'Internet Initiative Japan Inc.'})-[:COUNTRY {org_ids: ['@aut-2497-JPNIC']}]->(:Country {country_code: 'JP'}) 120 | (:Organization {name: 'Internet Initiative Japan Inc.'})-[:NAME {org_ids: ['@aut-2497-JPNIC']}]->(:Name {name: 'Internet Initiative Japan Inc.'}) 121 | (:Organization {name: 'Internet Initiative Japan Inc.'})-[:EXTERNAL_ID]->(:CaidaOrgID {id: '@aut-2497-JPNIC'}) 122 | ``` 123 | 124 | We keep track of the org_id assigned by CAIDA in both the relationships and by adding a 125 | `CaidaOrgID` node. The main reason for this is that there are organizations with the 126 | same name but different IDs that may map to different countries. 127 | 128 | ### Dependence 129 | 130 | The as2org crawler does not depend on other crawlers.
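As mentioned in the note above, CAIDA and BGPKIT encode customer-provider links with different `rel` values (`-1` vs. `1`) while sharing the `provider -> customer` direction and `rel: 0` for peers. The following hedged sketch shows one way a consumer of the graph could normalize the two encodings; the `reference_name` prefixes are the crawler names used in this repository, and the function itself is illustrative rather than part of the code base.

```python
# Hedged sketch: normalize the 'rel' property of PEERS_WITH relationships
# created by the CAIDA and BGPKIT crawlers into one convention.
def normalized_rel(reference_name: str, rel: int) -> str:
    if rel == 0:
        # Both sources use 0 for peer-to-peer links.
        return 'peer'
    if reference_name.startswith('caida.') and rel == -1:
        return 'provider-customer'
    if reference_name.startswith('bgpkit.') and rel == 1:
        return 'provider-customer'
    raise ValueError(f'unexpected rel={rel} for {reference_name}')


assert normalized_rel('caida.as_relationships_v4', -1) == 'provider-customer'
assert normalized_rel('bgpkit.as2rel_v4', 1) == 'provider-customer'
```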
131 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/__init__.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import logging 3 | import os 4 | from datetime import datetime, timezone 5 | from io import BytesIO 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | from iyp import BaseCrawler 11 | 12 | 13 | class ASRelCrawler(BaseCrawler): 14 | def __init__(self, organization, url, name, af): 15 | super().__init__(organization, url, name) 16 | self.af = af 17 | self.reference['reference_url_info'] = \ 18 | 'https://publicdata.caida.org/datasets/as-relationships/serial-1/README.txt' 19 | 20 | def __get_latest_file(self): 21 | index = requests.get(self.reference['reference_url_data']) 22 | index.raise_for_status() 23 | soup = BeautifulSoup(index.text, features='html.parser') 24 | if self.af == 4: 25 | filename_template = '%Y%m%d.as-rel.txt.bz2' 26 | else: 27 | filename_template = '%Y%m%d.as-rel.v6-stable.txt.bz2' 28 | links = soup.find_all('a') 29 | file_dates = list() 30 | for link in links: 31 | try: 32 | dt = datetime.strptime(link['href'], filename_template).replace(tzinfo=timezone.utc) 33 | except ValueError: 34 | continue 35 | file_dates.append((dt, link['href'])) 36 | file_dates.sort() 37 | latest_file_date, latest_file_name = file_dates[-1] 38 | self.reference['reference_time_modification'] = latest_file_date 39 | self.reference['reference_url_data'] = os.path.join(self.reference['reference_url_data'], latest_file_name) 40 | logging.info(f'Fetching file: {self.reference["reference_url_data"]}') 41 | 42 | def run(self): 43 | self.__get_latest_file() 44 | req = requests.get(self.reference['reference_url_data']) 45 | req.raise_for_status() 46 | 47 | with bz2.open(BytesIO(req.content), 'rb') as f: 48 | text = f.read().decode() 49 | 50 | ases = set() 51 | peers_with_links = list() 52 | for line in text.splitlines(): 53 | if line.startswith('#'): 54 | continue 55 | left_asn, right_asn, kind = map(int, line.split('|')) 56 | ases.add(left_asn) 57 | ases.add(right_asn) 58 | peers_with_links.append({'src_id': left_asn, 'dst_id': right_asn, 59 | 'props': [self.reference, {'rel': kind, 'af': self.af}]}) 60 | 61 | as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases) 62 | 63 | for link in peers_with_links: 64 | link['src_id'] = as_id[link['src_id']] 65 | link['dst_id'] = as_id[link['dst_id']] 66 | 67 | self.iyp.batch_add_links('PEERS_WITH', peers_with_links) 68 | 69 | def unit_test(self): 70 | return super().unit_test(['PEERS_WITH']) 71 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/as_relationships_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.caida import ASRelCrawler 6 | 7 | URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' 8 | ORG = 'CAIDA' 9 | NAME = 'caida.as_relationships_v4' 10 | 11 | 12 | class Crawler(ASRelCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name, 4) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | 
datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/as_relationships_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.caida import ASRelCrawler 6 | 7 | URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' 8 | ORG = 'CAIDA' 9 | NAME = 'caida.as_relationships_v6' 10 | 11 | 12 | class Crawler(ASRelCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name, 6) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/ix_asns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import sys 5 | from datetime import datetime, timezone 6 | 7 | import arrow 8 | import flatdict 9 | import requests 10 | 11 | from iyp import BaseCrawler 12 | 13 | URL = 'https://publicdata.caida.org/datasets/ixps/' 14 | ORG = 'CAIDA' 15 | NAME = 'caida.ix_asns' 16 | 17 | 18 | class Crawler(BaseCrawler): 19 | def __init__(self, organization, url, name): 20 | """Initialization: Find the latest file and set the URL""" 21 | 22 | date = arrow.now() 23 | 24 | for _ in range(6): 25 | full_url = url + f'ix-asns_{date.year}{date.month:02d}.jsonl' 26 | req = requests.head(full_url) 27 | 28 | # Found the latest file 29 | if req.status_code == 200: 30 | url = full_url 31 | break 32 | 33 | date = date.shift(months=-1) 34 | 35 | else: 36 | # for loop was not 'broken', no file available 37 | raise Exception('No recent CAIDA ix-asns file available') 38 | date = date.datetime.replace(day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc) 39 | 40 | logging.info(f'Fetching data from: {url}') 41 | super().__init__(organization, url, name) 42 | self.reference['reference_url_info'] = 'https://publicdata.caida.org/datasets/ixps/README.txt' 43 | self.reference['reference_time_modification'] = date 44 | 45 | def __set_modification_time_from_metadata_line(self, line): 46 | try: 47 | date_str = json.loads(line.lstrip('#'))['date'] 48 | date = datetime.strptime(date_str, '%Y.%m.%d %H:%M:%S').replace(tzinfo=timezone.utc) 49 | self.reference['reference_time_modification'] = date 50 | except (json.JSONDecodeError, KeyError, ValueError) as e: 51 | logging.warning(f'Failed to get modification date from metadata line: {line.strip()}') 52 | logging.warning(e) 53 | 
logging.warning('Using date from filename.') 54 | 55 | def run(self): 56 | """Fetch the latest file and process lines one by one.""" 57 | 58 | req = requests.get(self.url) 59 | req.raise_for_status() 60 | 61 | lines = [] 62 | asns = set() 63 | 64 | # Find all possible values and create corresponding nodes 65 | for line in req.text.splitlines(): 66 | if line.startswith('#'): 67 | self.__set_modification_time_from_metadata_line(line) 68 | continue 69 | 70 | ix = json.loads(line) 71 | lines.append(ix) 72 | asns.add(int(ix.get('asn'))) 73 | 74 | # get node IDs for ASNs, names, and countries 75 | ixp_id = self.iyp.batch_get_node_extid('CaidaIXID') 76 | as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 77 | 78 | # Compute links and add them to neo4j 79 | member_links = [] 80 | 81 | for mem in lines: 82 | ixp_qid = ixp_id.get(mem['ix_id']) 83 | asn_qid = as_id.get(mem['asn']) 84 | flat_mem = dict(flatdict.FlatDict(mem)) 85 | 86 | member_links.append({'src_id': asn_qid, 'dst_id': ixp_qid, 87 | 'props': [self.reference, flat_mem]}) 88 | # Push all links to IYP 89 | self.iyp.batch_add_links('MEMBER_OF', member_links) 90 | 91 | def unit_test(self): 92 | return super().unit_test(['MEMBER_OF']) 93 | 94 | 95 | def main() -> None: 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument('--unit-test', action='store_true') 98 | args = parser.parse_args() 99 | 100 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 101 | logging.basicConfig( 102 | format=FORMAT, 103 | filename='log/' + NAME + '.log', 104 | level=logging.INFO, 105 | datefmt='%Y-%m-%d %H:%M:%S' 106 | ) 107 | 108 | logging.info(f'Started: {sys.argv}') 109 | 110 | crawler = Crawler(ORG, URL, NAME) 111 | if args.unit_test: 112 | crawler.unit_test() 113 | else: 114 | crawler.run() 115 | crawler.close() 116 | logging.info(f'Finished: {sys.argv}') 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | sys.exit(0) 122 | -------------------------------------------------------------------------------- /iyp/crawlers/cisco/README.md: -------------------------------------------------------------------------------- 1 | # Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html 2 | 3 | The popularity list contains most queried domains (ranging from TLDs to FQDNs) 4 | based on passive DNS usage across the Umbrella global network. 5 | 6 | IYP uses this data to create and annotate DomainName and HostName nodes. 7 | 8 | ## Graph representation 9 | 10 | The rank of the domain is indicated by the `rank` property of the relationship. 11 | 12 | ```Cypher 13 | (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) 14 | (:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) 15 | ``` 16 | 17 | ## Dependence 18 | 19 | This crawler depends on `openintel.umbrella1m`. 20 | -------------------------------------------------------------------------------- /iyp/crawlers/citizenlab/README.md: -------------------------------------------------------------------------------- 1 | # Citizen Lab -- https://github.com/citizenlab/test-lists/blob/master/lists/ 2 | 3 | Citizen lab's test lists are URL testing lists intended to help in testing URL censorship, divided by country codes. In addition to these local lists, the global list consists of a wide range of internationally relevant and popular websites, including sites with content that is perceived to be provocative or objectionable. The dataset tha maps URL to Category. 
4 | 5 | ## Graph representation 6 | 7 | ### URL tags 8 | Connect URL to tag nodes meaning that an URL has been categorized according to the 9 | given tag. 10 | ``` 11 | (:URL {url: "https://www.flipkart.com/"})-[:CATEGORIZED]->(:Tag {label: 'COMM'}) 12 | ``` 13 | 14 | ## Dependence 15 | 16 | This crawler is not depending on other crawlers. 17 | -------------------------------------------------------------------------------- /iyp/crawlers/citizenlab/urldb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from urllib.error import HTTPError 5 | 6 | import pandas as pd 7 | 8 | from iyp import BaseCrawler, RequestStatusError 9 | 10 | ORG = 'Citizen Lab' 11 | URL = 'https://github.com/citizenlab/test-lists/blob/master/lists/' 12 | NAME = 'citizenlab.urldb' 13 | 14 | 15 | def generate_url(suffix): 16 | base_url = 'https://raw.githubusercontent.com/citizenlab/test-lists/master/lists/' 17 | joined_url = ''.join([base_url, suffix, '.csv']) 18 | return joined_url 19 | 20 | 21 | class Crawler(BaseCrawler): 22 | def __init__(self, organization, url, name): 23 | super().__init__(organization, url, name) 24 | self.reference['reference_url_info'] = 'https://github.com/citizenlab/test-lists' 25 | 26 | def run(self): 27 | # Fetch country codes to generate urls 28 | try: 29 | cc_df = pd.read_csv(generate_url('00-LEGEND-country_codes'), keep_default_na=False) 30 | except Exception as e: 31 | logging.error(f'Failed to fetch country codes: {e}') 32 | raise RequestStatusError('Error while fetching data file') 33 | 34 | country_codes = [e.lower() for e in cc_df['CountryCode']] 35 | 36 | # Iterate through country_codes, generate an url, download the csv file, extract 37 | # the necessary information from the csv file, and push the data to IYP. 38 | relationship_pairs = set() 39 | urls = set() 40 | categories = set() 41 | 42 | for code in country_codes: 43 | # Not all country codes have CSV files. 44 | try: 45 | df = pd.read_csv(generate_url(code)) 46 | except HTTPError as e: 47 | # 404 is expected, everything else is not. 
48 | if e.getcode() != 404: 49 | logging.warning(f'Request for country code "{code}" failed with error: {e}') 50 | raise e 51 | continue 52 | 53 | for row in df.itertuples(): 54 | url = row.url 55 | category = row.category_description 56 | urls.add(url) 57 | categories.add(category) 58 | relationship_pairs.add((url, category)) 59 | 60 | url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', urls, all=False) 61 | category_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', categories, all=False) 62 | 63 | links = list() 64 | for (url, category) in relationship_pairs: 65 | url_qid = url_id[url] 66 | category_qid = category_id[category] 67 | links.append({'src_id': url_qid, 'dst_id': category_qid, 'props': [self.reference]}) 68 | 69 | # Push all links to IYP 70 | self.iyp.batch_add_links('CATEGORIZED', links) 71 | 72 | def unit_test(self): 73 | return super().unit_test(['CATEGORIZED']) 74 | 75 | 76 | def main() -> None: 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--unit-test', action='store_true') 79 | args = parser.parse_args() 80 | 81 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 82 | logging.basicConfig( 83 | format=FORMAT, 84 | filename='log/' + NAME + '.log', 85 | level=logging.INFO, 86 | datefmt='%Y-%m-%d %H:%M:%S' 87 | ) 88 | 89 | logging.info(f'Started: {sys.argv}') 90 | 91 | crawler = Crawler(ORG, URL, NAME) 92 | if args.unit_test: 93 | crawler.unit_test() 94 | else: 95 | crawler.run() 96 | crawler.close() 97 | logging.info(f'Finished: {sys.argv}') 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | sys.exit(0) 103 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/README.md: -------------------------------------------------------------------------------- 1 | # Cloudflare Radar -- https://radar.cloudflare.com/ 2 | 3 | Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to 4 | provide various datasets, including: 5 | 6 | - [Ordered top 100 domain 7 | names](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): The 100 8 | domains with the most DNS queries, including assigned ranks. 9 | - [Unordered top 1,000 to 1,000,000 10 | domains](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): Same as 11 | above, but there are no ranks assigned. Fetched set sizes are 1,000, 2,000, 5,000, 10,000, 20,000, 12 | 50,000, 100,000, 200,000, 500,000, and 1,000,000. 13 | - [Top 100 countries querying each of the 10,000 highest ranked domain 14 | names](https://developers.cloudflare.com/radar/investigate/dns/#top-locations): For each domain 15 | that is in the top 10,000 of *any* ranking included in IYP, fetch the top 100 countries with the 16 | most DNS queries. 17 | - [Top 100 ASes querying each of the 10,000 highest ranked domain 18 | names](https://developers.cloudflare.com/api/operations/radar-get-dns-top-ases): Same as above, but 19 | fetch AS numbers instead. 20 | 21 | All rankings are based on one week of data. 22 | Cloudflare radar's top location and ASes is available for both domain names 23 | and host names. Results are likely accounting for all NS, A, AAAA queries made to 24 | Cloudflare's resolver. Since NS queries for host names make no sense IYP links these 25 | results to `DomainName` nodes. 
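All of these datasets are fetched from the Cloudflare Radar API, which requires an application key (see the Notes section at the end of this README). As a rough sketch of the access pattern (key location, endpoint, and response layout mirror `top100.py` further below; treat the details as illustrative):

```python
# Hedged sketch: authenticated request to the Radar ranking endpoint.
# Key location and endpoint follow top100.py in this directory.
import json

import requests

API_KEY = json.load(open('config.json', 'r'))['cloudflare']['apikey']
URL = 'https://api.cloudflare.com/client/v4/radar/ranking/top?name=top&limit=100&format=json'

headers = {
    'Authorization': 'Bearer ' + API_KEY,
    'Content-Type': 'application/json',
}
req = requests.get(URL, headers=headers)
req.raise_for_status()

# The ordered top-100 list is under result -> top.
for entry in req.json()['result']['top']:
    print(entry['rank'], entry['domain'])
```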
26 | 27 | ## Graph representation 28 | 29 | ### Ordered top 100 domain names - `top100.py` 30 | 31 | Connect DomainName nodes to a single Ranking node corresponding to the ordered Cloudflare top 100 32 | ranking. The `rank` is given as a relationship property. 33 | 34 | ```Cypher 35 | (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cloudflare top 100 domains'}) 36 | ``` 37 | 38 | ### Unordered top *n* domain names - `ranking_bucket.py` 39 | 40 | Connect DomainName nodes to a single Ranking node corresponding to the unordered Cloudflare top *n* 41 | ranking. There is no rank assigned to the domain name, but *n* is specified in the `top` property of 42 | the Ranking node. 43 | 44 | ```Cypher 45 | (:DomainName {name: 'google.com'})-[:RANK]->(:Ranking {name: 'Cloudflare Top 1000 ranking domains', top: 1000}) 46 | ``` 47 | 48 | ### Top countries - `dns_top_locations.py` 49 | 50 | Connect each DomainName node to up to 100 Country nodes representing the countries from which the 51 | domain was queried from the most. The `value` property of the QUERIED_FROM relationship describes 52 | the percentage of all queries (within one week) originating from the country. 53 | 54 | ```Cypher 55 | (:DomainName {name: 'google.com'})-[:QUERIED_FROM {value: 37.05}]->(:Country {country_code: 'US'}) 56 | ``` 57 | 58 | ### Top ASes - `dns_top_ases.py` 59 | 60 | Connect each DomainName node to up to 100 AS nodes from which the domain was queried from the most. 61 | The `value` property of the QUERIED_FROM relationship describes the percentage of all queries 62 | (within one week) originating from the AS. 63 | 64 | ```Cypher 65 | (:DomainName {name: 'google.com'}-[:QUERIED_FROM {value: 3.51}]->(:AS {asn: 714})) 66 | ``` 67 | 68 | ## Dependence 69 | 70 | The `dns_top_locations` and `dns_top_ases` crawlers should be run after all crawlers that produce 71 | `(:DomainName)-[:RANK {rank: n}]->(:Ranking)` relationships: 72 | 73 | - `cloudflare.top100` 74 | - `tranco.top1m` 75 | 76 | ## Notes 77 | 78 | This crawler requires an application key to access the radar's API. 79 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/dns_top_ases.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import flatdict 6 | 7 | from iyp.crawlers.cloudflare import DnsTopCrawler 8 | 9 | ORG = 'Cloudflare' 10 | URL = 'https://api.cloudflare.com/client/v4/radar/dns/top/ases/' 11 | NAME = 'cloudflare.dns_top_ases' 12 | 13 | 14 | class Crawler(DnsTopCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name) 17 | 18 | self.reference['reference_url_info'] = 'https://developers.cloudflare.com/api/operations/radar-get-dns-top-ases' 19 | 20 | def compute_link(self, param): 21 | 22 | name, ases = param 23 | 24 | # 'meta' result it not a domain, but contains metadata so skip. 
25 | if name == 'meta': 26 | return 27 | 28 | qids = list() 29 | if name in self.domain_names_id: 30 | qids.append(self.domain_names_id[name]) 31 | if name in self.host_names_id: 32 | qids.append(self.host_names_id[name]) 33 | 34 | for entry in ases: 35 | if not entry: 36 | continue 37 | 38 | asn = entry['clientASN'] 39 | self.to_nodes.add(asn) 40 | 41 | entry['value'] = float(entry['value']) 42 | 43 | flat_prop = dict(flatdict.FlatDict(entry)) 44 | for qid in qids: 45 | self.links.append({ 46 | 'src_id': qid, 47 | 'dst_id': asn, 48 | 'props': [flat_prop, self.reference] 49 | }) 50 | 51 | def map_links(self): 52 | as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', self.to_nodes, all=False) 53 | for link in self.links: 54 | link['dst_id'] = as_id[link['dst_id']] 55 | 56 | 57 | def main() -> None: 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument('--unit-test', action='store_true') 60 | args = parser.parse_args() 61 | 62 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 63 | logging.basicConfig( 64 | format=FORMAT, 65 | filename='log/' + NAME + '.log', 66 | level=logging.INFO, 67 | datefmt='%Y-%m-%d %H:%M:%S' 68 | ) 69 | 70 | logging.info(f'Started: {sys.argv}') 71 | 72 | crawler = Crawler(ORG, URL, NAME) 73 | if args.unit_test: 74 | crawler.unit_test() 75 | else: 76 | crawler.run() 77 | crawler.close() 78 | logging.info(f'Finished: {sys.argv}') 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | sys.exit(0) 84 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/dns_top_locations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import flatdict 6 | 7 | from iyp.crawlers.cloudflare import DnsTopCrawler 8 | 9 | ORG = 'Cloudflare' 10 | URL = 'https://api.cloudflare.com/client/v4/radar/dns/top/locations/' 11 | NAME = 'cloudflare.dns_top_locations' 12 | 13 | 14 | class Crawler(DnsTopCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name) 17 | 18 | self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/dns/#top-locations' 19 | 20 | def compute_link(self, param): 21 | 22 | domain, countries = param 23 | 24 | # 'meta' result it not a domain, but contains metadata so skip. 
25 | if domain == 'meta': 26 | return 27 | 28 | domain_qid = self.domain_names_id[domain] 29 | 30 | for entry in countries: 31 | if not entry: 32 | continue 33 | 34 | cc = entry['clientCountryAlpha2'] 35 | self.to_nodes.add(cc) 36 | 37 | entry['value'] = float(entry['value']) 38 | 39 | flat_prop = dict(flatdict.FlatDict(entry)) 40 | self.links.append({ 41 | 'src_id': domain_qid, 42 | 'dst_id': cc, 43 | 'props': [flat_prop, self.reference] 44 | }) 45 | 46 | def map_links(self): 47 | cc_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', self.to_nodes, all=False) 48 | for link in self.links: 49 | link['dst_id'] = cc_id[link['dst_id']] 50 | 51 | 52 | def main() -> None: 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--unit-test', action='store_true') 55 | args = parser.parse_args() 56 | 57 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 58 | logging.basicConfig( 59 | format=FORMAT, 60 | filename='log/' + NAME + '.log', 61 | level=logging.INFO, 62 | datefmt='%Y-%m-%d %H:%M:%S' 63 | ) 64 | 65 | logging.info(f'Started: {sys.argv}') 66 | 67 | crawler = Crawler(ORG, URL, NAME) 68 | if args.unit_test: 69 | crawler.unit_test() 70 | else: 71 | crawler.run() 72 | crawler.close() 73 | logging.info(f'Finished: {sys.argv}') 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | sys.exit(0) 79 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/top100.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import sys 6 | from datetime import datetime, timezone 7 | 8 | import requests 9 | 10 | from iyp import BaseCrawler 11 | 12 | # Organization name and URL to data 13 | ORG = 'Cloudflare' 14 | URL = 'https://api.cloudflare.com/client/v4/radar/ranking/top?name=top&limit=100&format=json' 15 | NAME = 'cloudflare.top100' 16 | 17 | API_KEY = '' 18 | if os.path.exists('config.json'): 19 | API_KEY = json.load(open('config.json', 'r'))['cloudflare']['apikey'] 20 | 21 | 22 | class Crawler(BaseCrawler): 23 | # Base Crawler provides access to IYP via self.iyp 24 | # and setup a dictionary with the org/url/today's date in self.reference 25 | # 26 | # Cloudflare ranks second and third level domain names (not host names). 
27 | # See https://blog.cloudflare.com/radar-domain-rankings/ 28 | def __init__(self, organization, url, name): 29 | super().__init__(organization, url, name) 30 | self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/' # noqa: E501 31 | 32 | def run(self): 33 | """Fetch data and push to IYP.""" 34 | 35 | self.cf_qid = self.iyp.get_node( 36 | 'Ranking', {'name': 'Cloudflare top 100 domains'}) 37 | 38 | # Fetch data 39 | headers = { 40 | 'Authorization': 'Bearer ' + API_KEY, 41 | 'Content-Type': 'application/json' 42 | } 43 | 44 | req = requests.get(self.reference['reference_url_data'], headers=headers) 45 | req.raise_for_status() 46 | 47 | results = req.json()['result'] 48 | 49 | try: 50 | date_str = results['meta']['dateRange'][0]['endTime'] 51 | date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) 52 | self.reference['reference_time_modification'] = date 53 | except (KeyError, ValueError, TypeError) as e: 54 | logging.warning(f'Failed to get modification time: {e}') 55 | 56 | # Process line one after the other 57 | processed = list(map(self.update, results['top'])) 58 | logging.info(f'Processed {len(processed)} lines') 59 | 60 | def update(self, entry): 61 | """Add the entry to IYP if it's not already there and update its properties.""" 62 | 63 | # set rank 64 | statements = [['RANK', self.cf_qid, dict({'rank': entry['rank']}, **self.reference)]] 65 | 66 | # Commit to IYP 67 | # Get the AS's node ID (create if it is not yet registered) and commit changes 68 | domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}) 69 | self.iyp.add_links(domain_qid, statements) 70 | 71 | def unit_test(self): 72 | return super().unit_test(['RANK']) 73 | 74 | 75 | def main() -> None: 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--unit-test', action='store_true') 78 | args = parser.parse_args() 79 | 80 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 81 | logging.basicConfig( 82 | format=FORMAT, 83 | filename='log/' + NAME + '.log', 84 | level=logging.INFO, 85 | datefmt='%Y-%m-%d %H:%M:%S' 86 | ) 87 | 88 | logging.info(f'Started: {sys.argv}') 89 | 90 | crawler = Crawler(ORG, URL, NAME) 91 | if args.unit_test: 92 | crawler.unit_test() 93 | else: 94 | crawler.run() 95 | crawler.close() 96 | logging.info(f'Finished: {sys.argv}') 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | sys.exit(0) 102 | -------------------------------------------------------------------------------- /iyp/crawlers/emileaben/README.md: -------------------------------------------------------------------------------- 1 | # EmileAben's AS Names -- https://github.com/emileaben/asnames 2 | 3 | Data collected by EmileAben's AS Names, including: 4 | 5 | - AS numbers 6 | - AS names 7 | 8 | ## Graph representation 9 | 10 | ### AS names 11 | 12 | Connect AS to names nodes, providing the name of an AS. 13 | For example: 14 | 15 | ```Cypher 16 | (:AS {asn: 2497})-[:NAME]-(:Name {name: 'IIJ'}) 17 | ``` 18 | 19 | ## Dependence 20 | 21 | This crawler is not depending on other crawlers. 
22 | -------------------------------------------------------------------------------- /iyp/crawlers/emileaben/as_names.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import sys 5 | import tempfile 6 | 7 | import requests 8 | 9 | from iyp import BaseCrawler, get_commit_datetime 10 | 11 | # Organization name and URL to data 12 | ORG = 'emileaben' 13 | URL = 'https://raw.githubusercontent.com/emileaben/asnames/main/asnames.csv' 14 | NAME = 'emileaben.as_names' # should reflect the directory and name of this file 15 | 16 | 17 | class Crawler(BaseCrawler): 18 | # Base Crawler provides access to IYP via self.iyp 19 | # and setup a dictionary with the org/url/today's date in self.reference 20 | def __init__(self, organization, url, name): 21 | super().__init__(organization, url, name) 22 | self.reference['reference_url_info'] = 'https://github.com/emileaben/asnames' 23 | self.reference['reference_time_modification'] = get_commit_datetime('emileaben/asnames', 'asnames.csv') 24 | 25 | def run(self): 26 | # Create a temporary directory 27 | tmpdir = tempfile.mkdtemp() 28 | filename = os.path.join(tmpdir, 'as_names.txt') 29 | 30 | res = requests.get(URL) 31 | res.raise_for_status() 32 | 33 | with open(filename, 'w') as file: 34 | file.write(res.text) 35 | 36 | lines = [] 37 | asns = set() 38 | as_names = set() 39 | 40 | with open(filename, 'r') as file: 41 | for line in file: 42 | line = line.strip() 43 | values = line.split('|') 44 | as_number = values[0] 45 | as_name = values[2] 46 | asns.add(int(as_number)) 47 | as_names.add(as_name) 48 | lines.append(values) 49 | 50 | asns_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 51 | as_names_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', as_names, all=False) 52 | 53 | links = [] 54 | 55 | for line in lines: 56 | asn_qid = asns_id[int(line[0])] 57 | as_name_qid = as_names_id[line[2]] 58 | links.append( 59 | {'src_id': asn_qid, 'dst_id': as_name_qid, 'props': [self.reference, {'contributor': line[1]}]}) 60 | 61 | # Push all links to IYP 62 | self.iyp.batch_add_links('NAME', links) 63 | 64 | def unit_test(self): 65 | return super().unit_test(['NAME']) 66 | 67 | 68 | def main() -> None: 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--unit-test', action='store_true') 71 | args = parser.parse_args() 72 | 73 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 74 | logging.basicConfig( 75 | format=FORMAT, 76 | filename='log/' + NAME + '.log', 77 | level=logging.INFO, 78 | datefmt='%Y-%m-%d %H:%M:%S' 79 | ) 80 | 81 | logging.info(f'Started: {sys.argv}') 82 | 83 | crawler = Crawler(ORG, URL, NAME) 84 | if args.unit_test: 85 | crawler.unit_test() 86 | else: 87 | crawler.run() 88 | crawler.close() 89 | logging.info(f'Finished: {sys.argv}') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | sys.exit(0) 95 | -------------------------------------------------------------------------------- /iyp/crawlers/example/README.md: -------------------------------------------------------------------------------- 1 | # Example Crawler 2 | 3 | This is an example of crawler where nodes/relationships are pushed in batches. It is not 4 | a working example but can be used as a template. For a simple working example see 5 | [here](../bgpkit/pfx2asn.py). 6 | 7 | While there are methods to get/create individual nodes they should only be used in rare 8 | cases, as batch creation is almost always faster. 
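To make the batch pattern concrete, here is a hedged sketch of the typical shape of a crawler's `run()` method. The method names (`batch_get_nodes_by_single_prop`, `batch_add_links`) are taken from the crawlers in this repository; the input data and the `EXAMPLE_*` labels are placeholders.

```python
# Hedged sketch of the batch pattern used throughout this repository.
def run(self):
    # Placeholder input; a real crawler would fetch and parse its data source.
    records = [{'asn': 2497, 'name': 'example'}]

    # 1. Collect all property values and create/fetch the nodes in one batch.
    asns = {r['asn'] for r in records}
    names = {r['name'] for r in records}
    asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
    name_id = self.iyp.batch_get_nodes_by_single_prop('EXAMPLE_NODE_LABEL', 'name', names)

    # 2. Build the relationship list, referencing nodes by their IDs.
    links = [{'src_id': asn_id[r['asn']],
              'dst_id': name_id[r['name']],
              'props': [self.reference]} for r in records]

    # 3. Push all relationships in one batch.
    self.iyp.batch_add_links('EXAMPLE_RELATIONSHIP_LABEL', links)
```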
9 | 10 | The first paragraph of this readme should be a description of the dataset and give an 11 | overview of the parts we push to IYP. 12 | 13 | ## Graph representation 14 | 15 | Connect AS nodes to EXAMPLE_NODE_LABEL with EXAMPLE_RELATIONSHIP_LABEL relationship. 16 | 17 | ```cypher 18 | (:AS)-[:EXAMPLE_RELATIONSHIP_LABEL]-(:EXAMPLE_NODE_LABEL) 19 | ``` 20 | 21 | ## Dependence 22 | 23 | This crawler is not depending on other crawlers. 24 | -------------------------------------------------------------------------------- /iyp/crawlers/google/README.md: -------------------------------------------------------------------------------- 1 | # Google CrUX -- https://developer.chrome.com/docs/crux 2 | 3 | The Chrome User Experience Report (CrUX for short) is a dataset collected by 4 | Google that reflects how real-world Chrome users experience popular 5 | destinations on the web. 6 | 7 | CrUX data is collected from real browsers around the world, based on certain 8 | browser options which determine user eligibility. A set of dimensions and metrics 9 | are collected which allow site owners to determine how users experience their sites. 10 | 11 | IYP fetches CrUX's [top 1M popular websites per country](https://github.com/InternetHealthReport/crux-top-lists-country). 12 | Unlike others, CrUX rankings are bucketed by rank magnitude order, not by 13 | specific rank. For example, ranks are 1000, 10k, 100k, or 1M. 14 | 15 | In addition, CrUX ranks *origins* (e.g., https://www.google.com), not domain 16 | or host names. In IYP we extract the hostname part of the origin and model this 17 | dataset using the hostname. 18 | 19 | ## Graph representation 20 | 21 | ```cypher 22 | (:HostName {name:'www.iij.ad.jp'})-[:RANK {rank: 50000, origin:'https://www.iij.ad.jp'}]-(r:Ranking {name:'CrUX top 1M (JP)'})-[:COUNTRY]-(:Country {country_code:'JP'}) 23 | ``` 24 | 25 | The `RANK` relationship contains the property `origin` to recover the origin 26 | given in the original dataset. 27 | 28 | ## Dependence 29 | 30 | This crawler is not depending on other crawlers. 31 | -------------------------------------------------------------------------------- /iyp/crawlers/iana/README.md: -------------------------------------------------------------------------------- 1 | # IANA -- https://www.iana.org/ 2 | 3 | The Internet Assigned Numbers Authority (IANA) is responsible for the global 4 | coordination of the DNS Root, IP addressing, and other Internet protocol resources. 5 | 6 | Datasets used by IYP: 7 | 8 | - DNS [root zone file](https://www.iana.org/domains/root/files) to retrieve information 9 | about authoritative name servers of the top-level domains as well as their IP 10 | addresses. 11 | 12 | ## Graph representation 13 | 14 | ### Root zone file - `root_zone.py` 15 | 16 | IYP imports `NS`, `A`, and `AAAA` records from the root zone file. 
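As a hedged illustration of how such records could be turned into the relationships listed below (assuming the standard whitespace-separated zone file format; the actual `root_zone.py` crawler may parse the file differently):

```python
# Hedged sketch: extract NS/A/AAAA records from root zone file lines.
def parse_root_zone(lines):
    managed_by, resolves_to = [], []
    for line in lines:
        fields = line.split()
        if len(fields) < 5 or fields[2] != 'IN':
            continue
        name, rtype, rdata = fields[0].rstrip('.'), fields[3], fields[4]
        if rtype == 'NS':
            managed_by.append((name, rdata.rstrip('.')))   # (:DomainName)-[:MANAGED_BY]->(...)
        elif rtype in ('A', 'AAAA'):
            resolves_to.append((name, rdata))              # (...)-[:RESOLVES_TO]->(:IP)
    return managed_by, resolves_to


zone = ['jp.\t172800\tIN\tNS\ta.dns.jp.',
        'a.dns.jp.\t172800\tIN\tA\t203.119.1.1']
print(parse_root_zone(zone))
# ([('jp', 'a.dns.jp')], [('a.dns.jp', '203.119.1.1')])
```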
17 | 18 | ```Cypher 19 | // NS record 20 | (:DomainName {name: 'jp'})-[:MANAGED_BY]->(:DomainName:AuthoritativeNameServer {name: 'a.dns.jp'}) 21 | // A record 22 | (:DomainName:AuthoritativeNameServer {name: 'a.dns.jp'})-[:RESOLVES_TO]->(:IP {ip: '203.119.1.1'}) 23 | // AAAA record 24 | (:DomainName:AuthoritativeNameServer {name: 'a.dns.jp'})-[:RESOLVES_TO]->(:IP {ip: '2001:dc4::1'}) 25 | ``` 26 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/README.md: -------------------------------------------------------------------------------- 1 | # Internet Health Report -- https://ihr.iijlab.net/ 2 | 3 | Data inferred by IIJ's Internet Health Report, including: 4 | 5 | - AS inter-dependency 6 | - country's AS dependency 7 | - prefixes' RPKI and IRR status 8 | 9 | #### Country's AS dependency 10 | The way to retrieve the country AS dependency values shown on IHR website (e.g. https://ihr.iijlab.net/ihr/en-us/countries/JP) is as follow. 11 | For eyeball ranking nodes get `hege` and `weight` values from the corresponding RANK relationship and then: 12 | - Population Total = 100*`hege` 13 | - Population Direct = `weight` 14 | - Population Indirect = 100*`hege`-`weight` 15 | 16 | For AS ranking nodes get `hege` values from the corresponding RANK relationship and then: 17 | - AS Total = 100*`hege` 18 | 19 | The values are not exactly the same as the ones shown on the IHR website because the IHR website averages results over three days. 20 | 21 | 22 | 23 | ## Graph representation 24 | 25 | ### AS inter-dependency - `local_hegemony.py` 26 | 27 | Connect ASes that are depending on each other, meaning that an AS is commonly found on routes 28 | towards an origin AS. The strength of the dependence is given by the `hege` link attribute (AS 29 | Hegemony value) which range from 0 to 1. Strongest dependencies being equal to 1. 30 | 31 | ```Cypher 32 | (:AS {asn: 2907})-[:DEPENDS_ON {hege: 0.82}]-(:AS {asn: 2497}) 33 | ``` 34 | 35 | ### Country's AS dependency - `country_dependency.py` 36 | 37 | Connect ASes to ranking nodes which are also connected to a country. A Country AS dependency is 38 | computed in two different ways, emphasizing either the distribution of the country's population 39 | (a.k.a. Total eyeball) or the country ASes (a.k.a. Total AS), for example: 40 | 41 | ```Cypher 42 | (:AS {asn: 2497})-[:RANK {rank: 1, hege: 0.19}]-> 43 | (:Ranking {name: 'IHR country ranking: Total AS (JP)'})-[:COUNTRY]-> 44 | (:Country {country_code: 'JP'}) 45 | ``` 46 | 47 | means that Japan ASes depends strongly (AS Hegemony equals 0.19) on AS2497. 48 | 49 | ### Prefixes' RPKI and IRR status - `rov.py` 50 | 51 | Connect prefixes to their origin AS, their AS dependencies, their RPKI/IRR status, and their country 52 | (provided by Maxmind). 53 | 54 | ```Cypher 55 | (:BGPPrefix {prefix: '8.8.8.0/24'})<-[:ORIGINATE]-(:AS {asn: 15169}) 56 | (:BGPPrefix {prefix: '8.8.8.0/24'})-[:DEPENDS_ON]->(:AS {asn: 15169}) 57 | (:BGPPrefix {prefix: '8.8.8.0/24'})-[:CATEGORIZED]->(:Tag {label: 'RPKI Valid'}) 58 | (:BGPPrefix {prefix: '8.8.8.0/24'})-[:COUNTRY]->(:Country {country_code: 'US'}) 59 | ``` 60 | 61 | Tag labels (possibly) added by this crawler: 62 | 63 | - `RPKI Valid` 64 | - `RPKI Invalid` 65 | - `RPKI Invalid,more-specific` 66 | - `RPKI NotFound` 67 | - `IRR Valid` 68 | - `IRR Invalid` 69 | - `IRR Invalid,more-specific` 70 | - `IRR NotFound` 71 | 72 | The country geo-location is provided by Maxmind. 
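For illustration, a query of the following form (a sketch combining the relationships above, not part of the crawler) lists RPKI-invalid prefixes together with their origin AS and country:

```Cypher
MATCH (p:BGPPrefix)-[:CATEGORIZED]->(:Tag {label: 'RPKI Invalid'}),
      (p)<-[:ORIGINATE]-(a:AS),
      (p)-[:COUNTRY]->(c:Country)
RETURN p.prefix, a.asn, c.country_code
LIMIT 10
```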
73 | 74 | ## Dependence 75 | 76 | These crawlers are not depending on other crawlers. 77 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/__init__.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import logging 4 | from datetime import datetime, timedelta, timezone 5 | 6 | import lz4.frame 7 | import requests 8 | 9 | from iyp import BaseCrawler, DataNotAvailableError 10 | 11 | 12 | class HegemonyCrawler(BaseCrawler): 13 | def __init__(self, organization, url, name, af): 14 | self.af = af 15 | super().__init__(organization, url, name) 16 | self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#AS_dependency' 17 | 18 | def run(self): 19 | """Fetch data from file and push to IYP.""" 20 | 21 | today = datetime.now(tz=timezone.utc) 22 | max_lookback = today - timedelta(days=7) 23 | url = today.strftime(self.url) 24 | req = requests.head(url) 25 | while req.status_code != 200 and today > max_lookback: 26 | today -= timedelta(days=1) 27 | url = today.strftime(self.url) 28 | req = requests.head(url) 29 | if req.status_code != 200: 30 | logging.error('Failed to find data within the specified lookback interval.') 31 | raise DataNotAvailableError('Failed to find data within the specified lookback interval.') 32 | 33 | self.reference['reference_url_data'] = url 34 | 35 | logging.info(f'Fetching data from: {url}') 36 | req = requests.get(url) 37 | req.raise_for_status() 38 | 39 | # lz4.frame.decompress() and splitlines() break the CSV parsing due to some 40 | # weird input. 41 | with lz4.frame.open(io.BytesIO(req.content)) as f: 42 | csv_lines = [l.decode('utf-8').rstrip() for l in f] 43 | 44 | timebin = None 45 | asns = set() 46 | links = list() 47 | 48 | logging.info('Computing links...') 49 | for rec in csv.DictReader(csv_lines): 50 | # header 51 | # timebin,originasn,asn,hege 52 | 53 | rec['hege'] = float(rec['hege']) 54 | rec['af'] = self.af 55 | 56 | if timebin is None: 57 | timebin = rec['timebin'] 58 | mod_time = datetime.strptime(timebin, '%Y-%m-%d %H:%M:%S+00').replace(tzinfo=timezone.utc) 59 | self.reference['reference_time_modification'] = mod_time 60 | elif timebin != rec['timebin']: 61 | break 62 | 63 | originasn = int(rec['originasn']) 64 | asn = int(rec['asn']) 65 | asns.add(originasn) 66 | asns.add(asn) 67 | 68 | links.append({ 69 | 'src_id': originasn, 70 | 'dst_id': asn, 71 | 'props': [self.reference, rec] 72 | }) 73 | 74 | asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 75 | # Replace values in links with node IDs. 
76 | for link in links: 77 | link['src_id'] = asn_id[link['src_id']] 78 | link['dst_id'] = asn_id[link['dst_id']] 79 | 80 | # Push links to IYP 81 | self.iyp.batch_add_links('DEPENDS_ON', links) 82 | 83 | def unit_test(self): 84 | return super().unit_test(['DEPENDS_ON']) 85 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/local_hegemony_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.ihr import HegemonyCrawler 6 | 7 | # URL to the API 8 | URL = 'https://archive.ihr.live/ihr/hegemony/ipv4/local/%Y/%m/%d/ihr_hegemony_ipv4_local_%Y-%m-%d.csv.lz4' 9 | ORG = 'IHR' 10 | NAME = 'ihr.local_hegemony_v4' 11 | 12 | 13 | class Crawler(HegemonyCrawler): 14 | def __init__(self, organization, url, name): 15 | super().__init__(organization, url, name, af=4) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/local_hegemony_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.ihr import HegemonyCrawler 6 | 7 | # URL to the API 8 | URL = 'https://archive.ihr.live/ihr/hegemony/ipv6/local/%Y/%m/%d/ihr_hegemony_ipv6_local_%Y-%m-%d.csv.lz4' 9 | ORG = 'IHR' 10 | NAME = 'ihr.local_hegemony_v6' 11 | 12 | 13 | class Crawler(HegemonyCrawler): 14 | def __init__(self, organization, url, name): 15 | super().__init__(organization, url, name, af=6) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/inetintel/README.md: -------------------------------------------------------------------------------- 1 | # Internet Intelligence Lab - Dataset: AS to Organization mapping -- https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping 2 | 3 | The dataset contains historical and current versions of the AS to Organization 4 | mapping datasets. A mapping will be created between AS to its sibling ASes. 
5 | 6 | ## Graph representation 7 | 8 | ### Sibling ASes 9 | Connect ASes that are managed by the same organization. 10 | ```cypher 11 | (a:AS {asn: 2497})-[:SIBLING_OF]->(b:AS) 12 | ``` 13 | 14 | ### Sibling organizations 15 | ```cypher 16 | (a:Organization {name: 'NTT Communications Corporation'})-[:SIBLING_OF]->(b:Organization {name: 'NTT Communications (N-BONE)'}) 17 | ``` 18 | 19 | ## Dependence 20 | 21 | This crawler assumes PeeringDB organizations are already present. 22 | -------------------------------------------------------------------------------- /iyp/crawlers/ipinfo/README.md: -------------------------------------------------------------------------------- 1 | # IPinfo -- https://ipinfo.io/ 2 | 3 | IPinfo is an IP geolocation service, that provides different kinds of IP databases, 4 | including a [free tier](https://ipinfo.io/products/free-ip-database) that maps IP ranges 5 | to countries. We import the free database into IYP. 6 | 7 | ## Graph representation 8 | 9 | Since the IP ranges are not necessarily CIDR aligned, we decompose unaligned ranges 10 | into their CIDR-equivalent blocks. The original range is retained in the `start_ip` and 11 | `end_ip` properties of the `COUNTRY` relationship. 12 | 13 | A range can also be just a single IP, resulting in /32 or /128 prefixes, which is 14 | intended. 15 | 16 | ```cypher 17 | (:GeoPrefix {prefix: '203.180.224.0/19'})-[:COUNTRY {start_ip: '203.180.204.28', end_ip: '203.181.102.41'}]->(:Country {country_code: 'JP'}) 18 | ``` 19 | 20 | ## Dependence 21 | 22 | This crawler is not depending on other crawlers. 23 | -------------------------------------------------------------------------------- /iyp/crawlers/ipinfo/ip_country.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | import sys 7 | from ipaddress import ip_address, summarize_address_range 8 | 9 | import requests 10 | 11 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 12 | 13 | ORG = 'IPinfo' 14 | URL = 'https://ipinfo.io/data/free/country.json.gz' 15 | NAME = 'ipinfo.ip_country' 16 | 17 | IPINFO_TOKEN = '' 18 | if os.path.exists('config.json'): 19 | IPINFO_TOKEN = json.load(open('config.json', 'r'))['ipinfo']['token'] 20 | 21 | 22 | class Crawler(BaseCrawler): 23 | def __init__(self, organization, url, name): 24 | super().__init__(organization, url, name) 25 | self.reference['reference_url_info'] = 'https://ipinfo.io/products/free-ip-database' 26 | 27 | def run(self): 28 | """Fetch data and push to IYP.""" 29 | 30 | headers = {'Authorization': f'Bearer {IPINFO_TOKEN}'} 31 | req = requests.get(self.reference['reference_url_data'], headers=headers) 32 | req.raise_for_status() 33 | 34 | set_modification_time_from_last_modified_header(self.reference, req) 35 | rows = gzip.decompress(req.content) 36 | 37 | countries = set() 38 | prefixes = set() 39 | links = list() 40 | 41 | for row in rows.splitlines(): 42 | doc = json.loads(row) 43 | start, end = ip_address(doc['start_ip']), ip_address(doc['end_ip']) 44 | for prefix in summarize_address_range(start, end): 45 | country_code = doc['country'] 46 | prefix = prefix.compressed 47 | countries.add(country_code) 48 | prefixes.add(prefix) 49 | links.append({'src_id': prefix, 'dst_id': country_code, 'props': [self.reference, doc]}) 50 | 51 | country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries, all=False) 52 | prefix_id = 
self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes, all=False) 53 | self.iyp.batch_add_node_label(list(prefix_id.values()), 'GeoPrefix') 54 | 55 | for link in links: 56 | link['src_id'] = prefix_id[link['src_id']] 57 | link['dst_id'] = country_id[link['dst_id']] 58 | 59 | self.iyp.batch_add_links('COUNTRY', links) 60 | 61 | def unit_test(self): 62 | return super().unit_test(['COUNTRY']) 63 | 64 | 65 | def main() -> None: 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--unit-test', action='store_true') 68 | args = parser.parse_args() 69 | 70 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 71 | logging.basicConfig( 72 | format=FORMAT, 73 | filename='log/' + NAME + '.log', 74 | level=logging.INFO, 75 | datefmt='%Y-%m-%d %H:%M:%S', 76 | ) 77 | 78 | logging.info(f'Started: {sys.argv}') 79 | 80 | crawler = Crawler(ORG, URL, NAME) 81 | if args.unit_test: 82 | crawler.unit_test() 83 | else: 84 | crawler.run() 85 | crawler.close() 86 | logging.info(f'Finished: {sys.argv}') 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | sys.exit(0) 92 | -------------------------------------------------------------------------------- /iyp/crawlers/manrs/README.md: -------------------------------------------------------------------------------- 1 | # MANRS -- https://www.manrs.org/ 2 | 3 | Mutually Agreed Norms for Routing Security (MANRS) is an initiative to improve the security and 4 | resilience of the Internet’s global routing system. It does this by encouraging those running BGP to 5 | implement well-established industry best practices and technological solutions that can address the 6 | most common threats. 7 | 8 | A network operator can become a MANRS member by implementing *Actions* that are further described 9 | [here](https://www.manrs.org/netops/network-operator-actions/). Currently there are four actions: 10 | 11 | 1. Filtering: Prevent propagation of incorrect routing information 12 | 1. Anti-spoofing: Prevent traffic with spoofed source IP addresses 13 | 1. Coordination: Facilitate global operational communication and coordination 14 | 1. Global Validation: Facilitate routing information on a global scale 15 | 16 | IYP contains information about the membership status of networks (in form of AS nodes) and which 17 | actions are implemented by each member. The country assignment provided by MANRS is also used to 18 | enhance the existing AS-to-Country mappings. 19 | 20 | ## Graph representation 21 | 22 | ```Cypher 23 | (:AS {asn: 2497})-[:MEMBER_OF]->(:Organization {name: 'MANRS'}) 24 | (:AS {asn: 2497})-[:IMPLEMENT]->(:ManrsAction {label: 'MANRS Action 1: Filtering'}) 25 | (:AS {asn: 2497})-[:COUNTRY]->(:Country {country_code: 'JP'}) 26 | ``` 27 | 28 | Possible labels for ManrsAction nodes: 29 | 30 | - `MANRS Action 1: Filtering` 31 | - `MANRS Action 2: Anti-spoofing` 32 | - `MANRS Action 3: Coordination` 33 | - `MANRS Action 4: Global Validation` 34 | 35 | ## Dependence 36 | 37 | This crawler is not depending on other crawlers. 38 | -------------------------------------------------------------------------------- /iyp/crawlers/nro/README.md: -------------------------------------------------------------------------------- 1 | # Number Resource Organization -- https://www.nro.net/ 2 | 3 | The Number Resource Organization (NRO) is the coordinating body for the world’s Regional Internet 4 | Registries (RIRs). The RIRs manage the distribution of Internet number resources (IP address space 5 | and Autonomous System Numbers) within their respective regions. 
6 | 7 | As part of a joint RIR project to provide consistent and accessible Internet number resource 8 | statistics the NRO publishes [*Extended Allocation and Assignment 9 | Reports*](https://www.nro.net/about/rirs/statistics/) (also called *delegated stats*) that contain 10 | information about assigned IP address ranges and AS numbers. 11 | 12 | Each line of the report is a record that either represents an IP address range or an AS number. The 13 | record has a status and maps to an *opaque ID* that uniquely identifies a single organization. 14 | Finally, the record contains a country code to which the organization belongs. 15 | 16 | **Note:** If the record is not assigned, the country code is `ZZ`, which will still be inserted into 17 | IYP. 18 | 19 | ## Graph representation 20 | 21 | ```Cypher 22 | (:AS {asn: 7494})-[:AVAILABLE {registry: 'apnic'}]->(:OpaqueID {id: 'apnic'}) 23 | (:AS {asn: 2497})-[:ASSIGNED {registry: 'apnic'}]->(:OpaqueID {id: 'A91A7381'}) 24 | (:AS {asn: 608})-[:RESERVED {registry: 'arin'}]->(:OpaqueID {id: 'arin'}) 25 | (:AS {asn: 2497})-[:COUNTRY]->(:Country {country_code: 'JP'}) 26 | 27 | (:RIRPrefix {prefix: '2a03:1dc0::/27'})-[:AVAILABLE {registry: 'ripencc'}]->(:OpaqueID {id: 'ripencc'}) 28 | (:RIRPrefix {prefix: '202.0.65.0/24'})-[:ASSIGNED {registry: 'apnic'}]->(:OpaqueID {id: 'A91A7381'}) 29 | (:RIRPrefix {prefix: '196.20.32.0/19'})-[:RESERVED {registry: 'afrinic'}]->(:OpaqueID {id: 'afrinic'}) 30 | (:RIRPrefix {prefix: '196.20.32.0/19'})-[:COUNTRY]->(:Country {country_code: 'ZZ'}) 31 | ``` 32 | 33 | The report also contains `allocated` records that would result in a `ALLOCATED` relationship. 34 | However, this crawler does not add ASes, so if the AS node was not created by another crawler, which 35 | should not happen for `allocated` ASes, the relationship is not created. 36 | 37 | The IPv4 address ranges in the report are not necessarily aligned with CIDR ranges (prefixes are 38 | represented by the first IP and a *count of hosts*). The crawler decomposes these IP blocks into 39 | valid CIDR prefixes if necessary. 40 | 41 | ## Dependence 42 | 43 | This crawler does not create new AS nodes and should be run after crawlers that push many AS nodes 44 | (e.g., `ripe.as_names`). 
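As noted above, IPv4 records in the report give a start address and a host count rather than a prefix. A minimal sketch of the decomposition into CIDR blocks, using only the standard library (the same `summarize_address_range` approach used by `ipinfo.ip_country`; the helper name is illustrative):

```python
from ipaddress import ip_address, summarize_address_range


def range_to_prefixes(start_ip, count):
    # A delegated-stats IPv4 record covers `count` addresses starting at
    # `start_ip`; the range is not necessarily CIDR aligned, so split it
    # into the equivalent list of CIDR prefixes.
    start = ip_address(start_ip)
    end = start + count - 1
    return [p.compressed for p in summarize_address_range(start, end)]


# e.g. range_to_prefixes('202.0.64.0', 1024) -> ['202.0.64.0/22']
#      range_to_prefixes('202.0.65.0', 512)  -> ['202.0.65.0/24', '202.0.66.0/24']
```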
45 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/facebookmessenger.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.facebookmessenger' 11 | 12 | label = 'OONI Facebook Messenger Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'facebookmessenger') 19 | self.categories = ['unblocked', 'dns_blocking', 'tcp_blocking', 'both_blocked'] 20 | 21 | # Process a single line from the jsonl file and store the results locally 22 | def process_one_line(self, one_line): 23 | if super().process_one_line(one_line): 24 | return 25 | result_dns = one_line['test_keys'].get('facebook_dns_blocking', None) 26 | result_tcp = one_line['test_keys'].get('facebook_tcp_blocking', None) 27 | if result_dns is None or result_tcp is None: 28 | self.all_results.pop() 29 | return 30 | 31 | # Using the last result from the base class, add our unique variables 32 | self.all_results[-1] = self.all_results[-1] + (result_dns, result_tcp) 33 | 34 | def batch_add_to_iyp(self): 35 | super().batch_add_to_iyp() 36 | 37 | facebookmessenger_id = self.iyp.get_node('Tag', {'label': label}, create=True) 38 | 39 | censored_links = list() 40 | 41 | # Create one link per ASN-country pair. 42 | for (asn, country), result_dict in self.all_percentages.items(): 43 | asn_id = self.node_ids['asn'][asn] 44 | props = dict() 45 | for category in self.categories: 46 | props[f'percentage_{category}'] = result_dict['percentages'][category] 47 | props[f'count_{category}'] = result_dict['category_counts'][category] 48 | props['total_count'] = result_dict['total_count'] 49 | props['country_code'] = country 50 | censored_links.append( 51 | {'src_id': asn_id, 'dst_id': facebookmessenger_id, 'props': [props, self.reference]} 52 | ) 53 | 54 | self.iyp.batch_add_links('CENSORED', censored_links) 55 | 56 | def aggregate_results(self): 57 | target_dict = defaultdict(lambda: defaultdict(int)) 58 | 59 | for entry in self.all_results: 60 | asn, country, result_dns, result_tcp = entry 61 | if not result_dns and not result_tcp: 62 | target_dict[(asn, country)]['unblocked'] += 1 63 | elif result_dns and not result_tcp: 64 | target_dict[(asn, country)]['dns_blocking'] += 1 65 | elif not result_dns and result_tcp: 66 | target_dict[(asn, country)]['tcp_blocking'] += 1 67 | elif result_dns and result_tcp: 68 | target_dict[(asn, country)]['both_blocked'] += 1 69 | 70 | for (asn, country), counts in target_dict.items(): 71 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 72 | 73 | def unit_test(self): 74 | return super().unit_test(['CENSORED']) 75 | 76 | 77 | def main() -> None: 78 | parser = argparse.ArgumentParser() 79 | parser.add_argument('--unit-test', action='store_true') 80 | args = parser.parse_args() 81 | 82 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 83 | logging.basicConfig( 84 | format=FORMAT, 85 | filename='log/' + NAME + '.log', 86 | level=logging.INFO, 87 | datefmt='%Y-%m-%d %H:%M:%S', 88 | ) 89 | 90 | logging.info(f'Started: {sys.argv}') 91 | 92 | crawler = Crawler(ORG, URL, NAME) 93 | if args.unit_test: 94 | crawler.unit_test() 95 | else: 96 | crawler.run() 97 | crawler.close() 98 | logging.info(f'Finished: {sys.argv}') 
99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | sys.exit(0) 104 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/httpinvalidrequestline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.httpinvalidrequestline' 11 | 12 | label = 'OONI HTTP Invalid Request Line Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'httpinvalidrequestline') 19 | self.categories = ['tampering', 'no_tampering'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the jsonl file and store the results locally.""" 23 | if super().process_one_line(one_line): 24 | return 25 | 26 | tampering = 'tampering' if one_line['test_keys']['tampering'] else 'no_tampering' 27 | 28 | # Using the last result from the base class, add our unique variables 29 | self.all_results[-1] = self.all_results[-1] + (tampering,) 30 | 31 | def batch_add_to_iyp(self): 32 | super().batch_add_to_iyp() 33 | 34 | httpinvalidrequestline_id = self.iyp.get_node('Tag', {'label': label}, create=True) 35 | 36 | censored_links = list() 37 | 38 | # Create one link per ASN-country pair. 39 | for (asn, country), result_dict in self.all_percentages.items(): 40 | asn_id = self.node_ids['asn'][asn] 41 | props = dict() 42 | for category in self.categories: 43 | props[f'percentage_{category}'] = result_dict['percentages'][category] 44 | props[f'count_{category}'] = result_dict['category_counts'][category] 45 | props['total_count'] = result_dict['total_count'] 46 | props['country_code'] = country 47 | censored_links.append( 48 | {'src_id': asn_id, 'dst_id': httpinvalidrequestline_id, 'props': [props, self.reference]} 49 | ) 50 | 51 | self.iyp.batch_add_links('CENSORED', censored_links) 52 | 53 | def aggregate_results(self): 54 | target_dict = defaultdict(lambda: defaultdict(int)) 55 | 56 | # Populate the target_dict with counts 57 | for entry in self.all_results: 58 | asn, country, tampering = entry 59 | target_dict[(asn, country)][tampering] += 1 60 | 61 | for (asn, country), counts in target_dict.items(): 62 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 63 | 64 | def unit_test(self): 65 | return super().unit_test(['CENSORED']) 66 | 67 | 68 | def main() -> None: 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--unit-test', action='store_true') 71 | args = parser.parse_args() 72 | 73 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 74 | logging.basicConfig( 75 | format=FORMAT, 76 | filename='log/' + NAME + '.log', 77 | level=logging.INFO, 78 | datefmt='%Y-%m-%d %H:%M:%S', 79 | ) 80 | 81 | logging.info(f'Started: {sys.argv}') 82 | 83 | crawler = Crawler(ORG, URL, NAME) 84 | if args.unit_test: 85 | crawler.unit_test() 86 | else: 87 | crawler.run() 88 | crawler.close() 89 | logging.info(f'Finished: {sys.argv}') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | sys.exit(0) 95 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/osignal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from 
iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | # This crawler is not called 'signal' to prevent name collision with Python's built-in 11 | # module. 12 | NAME = 'ooni.osignal' 13 | 14 | label = 'OONI Signal Test' 15 | 16 | 17 | class Crawler(OoniCrawler): 18 | 19 | def __init__(self, organization, url, name): 20 | super().__init__(organization, url, name, 'signal') 21 | self.categories = ['ok', 'blocked'] 22 | 23 | def process_one_line(self, one_line): 24 | """Process a single line from the jsonl file and store the results locally.""" 25 | if super().process_one_line(one_line): 26 | return 27 | signal_backend_status = one_line['test_keys']['signal_backend_status'] 28 | if signal_backend_status is None: 29 | self.all_results.pop() 30 | return 31 | 32 | # Using the last result from the base class, add our unique variables 33 | self.all_results[-1] = self.all_results[-1] + (signal_backend_status,) 34 | 35 | def batch_add_to_iyp(self): 36 | super().batch_add_to_iyp() 37 | 38 | signal_id = self.iyp.get_node('Tag', {'label': label}, create=True) 39 | 40 | censored_links = list() 41 | 42 | # Create one link per ASN-country pair. 43 | for (asn, country), result_dict in self.all_percentages.items(): 44 | asn_id = self.node_ids['asn'][asn] 45 | props = dict() 46 | for category in self.categories: 47 | props[f'percentage_{category}'] = result_dict['percentages'][category] 48 | props[f'count_{category}'] = result_dict['category_counts'][category] 49 | props['total_count'] = result_dict['total_count'] 50 | props['country_code'] = country 51 | censored_links.append( 52 | {'src_id': asn_id, 'dst_id': signal_id, 'props': [props, self.reference]} 53 | ) 54 | 55 | self.iyp.batch_add_links('CENSORED', censored_links) 56 | 57 | def aggregate_results(self): 58 | target_dict = defaultdict(lambda: defaultdict(int)) 59 | 60 | # Populate the target_dict with counts 61 | for entry in self.all_results: 62 | asn, country, result = entry 63 | target_dict[(asn, country)][result] += 1 64 | 65 | for (asn, country), counts in target_dict.items(): 66 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 67 | 68 | def unit_test(self): 69 | return super().unit_test(['CENSORED']) 70 | 71 | 72 | def main() -> None: 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--unit-test', action='store_true') 75 | args = parser.parse_args() 76 | 77 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 78 | logging.basicConfig( 79 | format=FORMAT, 80 | filename='log/' + NAME + '.log', 81 | level=logging.INFO, 82 | datefmt='%Y-%m-%d %H:%M:%S', 83 | ) 84 | 85 | logging.info(f'Started: {sys.argv}') 86 | 87 | crawler = Crawler(ORG, URL, NAME) 88 | if args.unit_test: 89 | crawler.unit_test() 90 | else: 91 | crawler.run() 92 | crawler.close() 93 | logging.info(f'Finished: {sys.argv}') 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | sys.exit(0) 99 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/psiphon.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.psiphon' 11 | 12 | label = 'OONI Psiphon Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 
'psiphon') 19 | self.categories = [ 20 | 'bootstrapping_error', 21 | 'usage_error', 22 | 'working', 23 | 'invalid', 24 | ] 25 | 26 | def process_one_line(self, one_line): 27 | """Process a single line from the jsonl file and store the results locally.""" 28 | if super().process_one_line(one_line): 29 | return 30 | 31 | if 'bootstrap_time' not in one_line['test_keys']: 32 | self.all_results.pop() 33 | return 34 | 35 | bootstrap_time = one_line['test_keys']['bootstrap_time'] 36 | failure = one_line['test_keys']['failure'] 37 | 38 | if bootstrap_time == 0 and failure is not None: 39 | result = 'bootstrapping_error' 40 | elif bootstrap_time > 0 and failure is not None: 41 | result = 'usage_error' 42 | elif bootstrap_time > 0 and failure is None: 43 | result = 'working' 44 | elif bootstrap_time == 0 and failure is None: 45 | result = 'invalid' 46 | 47 | # Using the last result from the base class, add our unique variables 48 | self.all_results[-1] = self.all_results[-1] + (result,) 49 | 50 | def batch_add_to_iyp(self): 51 | super().batch_add_to_iyp() 52 | 53 | psiphon_id = self.iyp.get_node('Tag', {'label': label}, create=True) 54 | 55 | censored_links = list() 56 | 57 | # Create one link per ASN-country pair. 58 | for (asn, country), result_dict in self.all_percentages.items(): 59 | asn_id = self.node_ids['asn'][asn] 60 | props = dict() 61 | for category in self.categories: 62 | props[f'percentage_{category}'] = result_dict['percentages'][category] 63 | props[f'count_{category}'] = result_dict['category_counts'][category] 64 | props['total_count'] = result_dict['total_count'] 65 | props['country_code'] = country 66 | censored_links.append( 67 | {'src_id': asn_id, 'dst_id': psiphon_id, 'props': [props, self.reference]} 68 | ) 69 | 70 | self.iyp.batch_add_links('CENSORED', censored_links) 71 | 72 | def aggregate_results(self): 73 | target_dict = defaultdict(lambda: defaultdict(int)) 74 | 75 | # Populate the target_dict with counts 76 | for entry in self.all_results: 77 | asn, country, result = entry 78 | target_dict[(asn, country)][result] += 1 79 | 80 | for (asn, country), counts in target_dict.items(): 81 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 82 | 83 | def unit_test(self): 84 | return super().unit_test(['CENSORED']) 85 | 86 | 87 | def main() -> None: 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--unit-test', action='store_true') 90 | args = parser.parse_args() 91 | 92 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 93 | logging.basicConfig( 94 | format=FORMAT, 95 | filename='log/' + NAME + '.log', 96 | level=logging.INFO, 97 | datefmt='%Y-%m-%d %H:%M:%S', 98 | ) 99 | 100 | logging.info(f'Started: {sys.argv}') 101 | 102 | crawler = Crawler(ORG, URL, NAME) 103 | if args.unit_test: 104 | crawler.unit_test() 105 | else: 106 | crawler.run() 107 | crawler.close() 108 | logging.info(f'Finished: {sys.argv}') 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | sys.exit(0) 114 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/riseupvpn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.riseupvpn' 11 | 12 | label = 'OONI RiseupVPN Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 
18 | super().__init__(organization, url, name, 'riseupvpn') 19 | self.categories = ['ok', 'failure'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the jsonl file and store the results locally.""" 23 | if super().process_one_line(one_line): 24 | return 25 | 26 | api_failures = one_line['test_keys'].get('api_failures') 27 | ca_cert_status = one_line['test_keys']['ca_cert_status'] 28 | 29 | if not api_failures and ca_cert_status: 30 | result = 'ok' 31 | else: 32 | result = 'failure' 33 | 34 | # Using the last result from the base class, add our unique variables 35 | self.all_results[-1] = self.all_results[-1] + (result,) 36 | 37 | def batch_add_to_iyp(self): 38 | super().batch_add_to_iyp() 39 | 40 | riseupvpn_id = self.iyp.get_node('Tag', {'label': label}, create=True) 41 | 42 | censored_links = list() 43 | 44 | # Create one link per ASN-country pair. 45 | for (asn, country), result_dict in self.all_percentages.items(): 46 | asn_id = self.node_ids['asn'][asn] 47 | props = dict() 48 | for category in self.categories: 49 | props[f'percentage_{category}'] = result_dict['percentages'][category] 50 | props[f'count_{category}'] = result_dict['category_counts'][category] 51 | props['total_count'] = result_dict['total_count'] 52 | props['country_code'] = country 53 | censored_links.append( 54 | {'src_id': asn_id, 'dst_id': riseupvpn_id, 'props': [props, self.reference]} 55 | ) 56 | 57 | self.iyp.batch_add_links('CENSORED', censored_links) 58 | 59 | def aggregate_results(self): 60 | target_dict = defaultdict(lambda: defaultdict(int)) 61 | 62 | # Populate the target_dict with counts 63 | for entry in self.all_results: 64 | asn, country, result = entry 65 | target_dict[(asn, country)][result] += 1 66 | 67 | for (asn, country), counts in target_dict.items(): 68 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 69 | 70 | def unit_test(self): 71 | return super().unit_test(['CENSORED']) 72 | 73 | 74 | def main() -> None: 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument('--unit-test', action='store_true') 77 | args = parser.parse_args() 78 | 79 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 80 | logging.basicConfig( 81 | format=FORMAT, 82 | filename='log/' + NAME + '.log', 83 | level=logging.INFO, 84 | datefmt='%Y-%m-%d %H:%M:%S', 85 | ) 86 | 87 | logging.info(f'Started: {sys.argv}') 88 | 89 | crawler = Crawler(ORG, URL, NAME) 90 | if args.unit_test: 91 | crawler.unit_test() 92 | else: 93 | crawler.run() 94 | crawler.close() 95 | logging.info(f'Finished: {sys.argv}') 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | sys.exit(0) 101 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/torsf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.torsf' 11 | 12 | label = 'OONI Tor Snowflake Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'torsf') 19 | self.categories = ['ok', 'failure'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the JSONL file.""" 23 | if super().process_one_line(one_line): 24 | return 25 | if 'success' not in one_line['test_keys']: 26 | self.all_results.pop() 27 | return 
28 | result = 'ok' if one_line['test_keys']['success'] else 'failure' 29 | 30 | # Update the last entry in all_results with the new test-specific data 31 | self.all_results[-1] = self.all_results[-1] + (result,) 32 | 33 | def batch_add_to_iyp(self): 34 | super().batch_add_to_iyp() 35 | 36 | torsf_id = self.iyp.get_node('Tag', {'label': label}, create=True) 37 | 38 | censored_links = list() 39 | 40 | # Create one link per ASN-country pair. 41 | for (asn, country), result_dict in self.all_percentages.items(): 42 | asn_id = self.node_ids['asn'][asn] 43 | props = dict() 44 | for category in self.categories: 45 | props[f'percentage_{category}'] = result_dict['percentages'][category] 46 | props[f'count_{category}'] = result_dict['category_counts'][category] 47 | props['total_count'] = result_dict['total_count'] 48 | props['country_code'] = country 49 | censored_links.append( 50 | {'src_id': asn_id, 'dst_id': torsf_id, 'props': [props, self.reference]} 51 | ) 52 | 53 | self.iyp.batch_add_links('CENSORED', censored_links) 54 | 55 | def aggregate_results(self): 56 | target_dict = defaultdict(lambda: defaultdict(int)) 57 | 58 | # Populate the target_dict with counts 59 | for entry in self.all_results: 60 | asn, country, result = entry 61 | target_dict[(asn, country)][result] += 1 62 | 63 | for (asn, country), counts in target_dict.items(): 64 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 65 | 66 | def unit_test(self): 67 | return super().unit_test(['CENSORED']) 68 | 69 | 70 | def main() -> None: 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--unit-test', action='store_true') 73 | args = parser.parse_args() 74 | 75 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 76 | logging.basicConfig( 77 | format=FORMAT, 78 | filename='log/' + NAME + '.log', 79 | level=logging.INFO, 80 | datefmt='%Y-%m-%d %H:%M:%S', 81 | ) 82 | 83 | logging.info(f'Started: {sys.argv}') 84 | 85 | crawler = Crawler(ORG, URL, NAME) 86 | if args.unit_test: 87 | crawler.unit_test() 88 | else: 89 | crawler.run() 90 | crawler.close() 91 | logging.info(f'Finished: {sys.argv}') 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | sys.exit(0) 97 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternetHealthReport/internet-yellow-pages/c79ced8783c73f282d273a886f4aa5cf8018fdd9/iyp/crawlers/ooni/utils/__init__.py -------------------------------------------------------------------------------- /iyp/crawlers/ooni/utils/grabber.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | import shutil 7 | from multiprocessing import Pool 8 | 9 | import boto3 10 | import botocore 11 | 12 | # Global variable required for multiprocessing. 13 | s3 = None 14 | 15 | PARALLEL_DOWNLOADS = 4 16 | if os.path.exists('config.json'): 17 | config = json.load(open('config.json', 'r')) 18 | PARALLEL_DOWNLOADS = config['ooni']['parallel_downloads'] 19 | 20 | 21 | def process(params: tuple): 22 | """Download and extract a single file. 23 | 24 | Args: 25 | params (tuple): Object key and output file path. 
26 | """ 27 | key, dest_file = params 28 | # Download the file 29 | try: 30 | s3.download_file(key, dest_file) 31 | except Exception as e: 32 | logging.error(f'Error downloading {key}: {e}') 33 | return 34 | 35 | # Extract the .gz file 36 | try: 37 | extracted_file = dest_file.rstrip('.gz') 38 | with gzip.open(dest_file, 'rb') as f_in, open(extracted_file, 'wb') as f_out: 39 | shutil.copyfileobj(f_in, f_out) 40 | # Delete the .gz file 41 | os.remove(dest_file) 42 | except Exception as e: 43 | logging.error(f'Error extracting {key}: {e}') 44 | 45 | 46 | def download_and_extract(repo: str, tmpdir: str, test_name: str): 47 | """Download the last 7 days of data for the specified test from an S3 bucket into a 48 | temporary directory. 49 | 50 | Args: 51 | repo (str): S3 bucket 52 | tmpdir (str): Output directory 53 | test_name (str): Test name 54 | """ 55 | global s3 56 | # Create an anonymous session 57 | s3 = boto3.resource( 58 | 's3', 59 | region_name='ap-northeast-1', 60 | config=botocore.client.Config( 61 | signature_version=botocore.UNSIGNED 62 | ) 63 | ).Bucket(repo) 64 | 65 | # Get the dates for the last 7 days. 66 | dates = [ 67 | ( 68 | datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=i) 69 | ).strftime('%Y%m%d') 70 | for i in range(7) 71 | ] 72 | 73 | files = list() 74 | 75 | logging.info('Fetching object list...') 76 | # For each day, grab the objects from the S3 bucket. 77 | for date in dates: 78 | date_objects = s3.objects.filter(Prefix=f'raw/{date}/').all() 79 | # Filter for objects from the requested test and only fetch JSONL files. 80 | for object_summary in date_objects: 81 | key = object_summary.key 82 | key_split = key.split('/') 83 | if len(key_split) != 6: 84 | logging.warning(f'Malformed key: {key}') 85 | continue 86 | test = key_split[4] 87 | object_name = key_split[5] 88 | if test != test_name or not object_name.endswith('.jsonl.gz'): 89 | continue 90 | dest_file = os.path.join(tmpdir, object_name) 91 | files.append((key, dest_file)) 92 | 93 | logging.info(f'Fetching {len(files)} objects with {PARALLEL_DOWNLOADS} processes in parallel...') 94 | # Download and extract the files. 95 | with Pool(PARALLEL_DOWNLOADS) as p: 96 | p.map(process, files) 97 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/vanillator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.vanillator' 11 | 12 | label = 'OONI Vanilla Tor Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'vanillator') 19 | self.categories = ['ok', 'failure'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the JSONL file.""" 23 | if super().process_one_line(one_line): 24 | return 25 | result = 'ok' if one_line['test_keys']['success'] else 'failure' 26 | 27 | # Update the last entry in all_results with the new test-specific data 28 | self.all_results[-1] = self.all_results[-1] + (result,) 29 | 30 | def batch_add_to_iyp(self): 31 | super().batch_add_to_iyp() 32 | 33 | vanillator_id = self.iyp.get_node('Tag', {'label': label}, create=True) 34 | 35 | censored_links = list() 36 | 37 | # Create one link per ASN-country pair. 
38 | for (asn, country), result_dict in self.all_percentages.items(): 39 | asn_id = self.node_ids['asn'][asn] 40 | props = dict() 41 | for category in self.categories: 42 | props[f'percentage_{category}'] = result_dict['percentages'][category] 43 | props[f'count_{category}'] = result_dict['category_counts'][category] 44 | props['total_count'] = result_dict['total_count'] 45 | props['country_code'] = country 46 | censored_links.append( 47 | {'src_id': asn_id, 'dst_id': vanillator_id, 'props': [props, self.reference]} 48 | ) 49 | 50 | self.iyp.batch_add_links('CENSORED', censored_links) 51 | 52 | def aggregate_results(self): 53 | target_dict = defaultdict(lambda: defaultdict(int)) 54 | 55 | # Populate the target_dict with counts 56 | for entry in self.all_results: 57 | asn, country, result = entry 58 | target_dict[(asn, country)][result] += 1 59 | 60 | for (asn, country), counts in target_dict.items(): 61 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 62 | 63 | def unit_test(self): 64 | return super().unit_test(['CENSORED']) 65 | 66 | 67 | def main() -> None: 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument('--unit-test', action='store_true') 70 | args = parser.parse_args() 71 | 72 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 73 | logging.basicConfig( 74 | format=FORMAT, 75 | filename='log/' + NAME + '.log', 76 | level=logging.INFO, 77 | datefmt='%Y-%m-%d %H:%M:%S', 78 | ) 79 | 80 | logging.info(f'Started: {sys.argv}') 81 | 82 | crawler = Crawler(ORG, URL, NAME) 83 | if args.unit_test: 84 | crawler.unit_test() 85 | else: 86 | crawler.run() 87 | crawler.close() 88 | logging.info(f'Finished: {sys.argv}') 89 | 90 | 91 | if __name__ == '__main__': 92 | main() 93 | sys.exit(0) 94 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/README.md: -------------------------------------------------------------------------------- 1 | # OpenINTEL -- https://www.openintel.nl/ 2 | 3 | The OpenINTEL measurement platform captures daily snapshots of the state of large parts of the 4 | global Domain Name System (DNS) by running a number of forward and reverse DNS measurements. 5 | 6 | While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for 7 | the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella 8 | top 1 million list since it combines rankings. 9 | IYP also get the list of authoritative names servers seen by OpenINTEL. 10 | 11 | A crawler of mail servers is also implemented but not used as it creates a very large number 12 | of links and this dataset is currently not requested/needed by anyone. 13 | 14 | ## Graph representation 15 | 16 | IP resolution for popular host names: 17 | 18 | ```Cypher 19 | (:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'}) 20 | ``` 21 | 22 | IP resolution of authoritative name servers: 23 | 24 | ```Cypher 25 | (:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'}) 26 | (:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'}) 27 | ``` 28 | 29 | Domain names managed by name servers: 30 | 31 | ```Cypher 32 | (:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'}) 33 | ``` 34 | 35 | ## Dependence 36 | 37 | This crawler is not depending on other crawlers. 
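As an illustration of how these relationships combine (an example query, not part of the crawlers), the authoritative name servers of a domain and the addresses they resolve to can be retrieved with:

```Cypher
MATCH (d:DomainName {name: 'google.com'})-[:MANAGED_BY]->(ns:HostName:AuthoritativeNameServer)
OPTIONAL MATCH (ns)-[:RESOLVES_TO]->(ip:IP)
RETURN d.name, ns.name, collect(ip.ip) AS addresses
```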
38 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/crux.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl/data/' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.crux' 10 | 11 | DATASET = 'crux' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_crux.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/CRUX' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_crux' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_jp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/JP' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_jp' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: 
{sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_nl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/NL' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_nl' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_rdns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/RDNS' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_rdns' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/infra_mx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.infra_mx' 10 | 11 | DATASET = 'infra:mx' 12 | NODE_TYPE = 'MailServer' 13 | 14 | 15 | class Crawler(OpenIntelCrawler): 16 | def __init__(self, organization, url, name): 17 | super().__init__(organization, url, name, DATASET, NODE_TYPE) 18 | 19 | 20 | def main() -> None: 21 | 22 | ############################################ 23 | # 
This crawler is not working the NODE_TYPE argument has been deprecated 24 | ############################################ 25 | return 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--unit-test', action='store_true') 29 | args = parser.parse_args() 30 | 31 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 32 | logging.basicConfig( 33 | format=FORMAT, 34 | filename='log/' + NAME + '.log', 35 | level=logging.INFO, 36 | datefmt='%Y-%m-%d %H:%M:%S' 37 | ) 38 | 39 | logging.info(f'Started: {sys.argv}') 40 | 41 | crawler = Crawler(ORG, URL, NAME) 42 | if args.unit_test: 43 | crawler.unit_test() 44 | else: 45 | crawler.run() 46 | crawler.close() 47 | logging.info(f'Finished: {sys.argv}') 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | sys.exit(0) 53 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/infra_ns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.infra_ns' 10 | 11 | DATASET = 'infra:ns' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/tranco1m.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl/data/tranco1m/' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.tranco1m' 10 | 11 | DATASET = 'tranco' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/umbrella1m.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl/data/umbrella1m/' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.umbrella1m' 10 | 11 | DATASET = 'umbrella' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/pch/README.md: -------------------------------------------------------------------------------- 1 | # Packet Clearing House -- https://www.pch.net/ 2 | 3 | Packet Clearing House (PCH) is an international organization responsible for providing operational 4 | support and security to critical Internet infrastructure, including Internet exchange points and the 5 | core of the domain name system. 6 | 7 | PCH operates route collectors at more than 100 Internet Exchange Points around the world. 8 | [Data](https://www.pch.net/resources/Routing_Data/) from these route collectors is made available 9 | publicly for the benefit of the Internet's operational and research communities. 10 | 11 | IYP fetches the *Daily snapshots of the results of "show ip bgp" on PCH route collectors*, which 12 | indicate the state of the routing table on PCH route collectors at the moment in time that the 13 | snapshot is taken. 14 | 15 | IYP uses the announced routes to infer the origin ASes of announced prefixes, some of which might 16 | not be visible in route collectors from Route Views or RIPE RIS. 17 | 18 | ## Graph representation 19 | 20 | ```Cypher 21 | (:AS {asn: 2497})-[:ORIGINATE {count: 4}]->(:BGPPrefix {prefix: '101.128.128.0/17'}) 22 | 23 | ``` 24 | 25 | The `ORIGINATE` relationship contains the property `count` that, similar to the relationship 26 | produced by `bgpkit.pfx2asn`, indicates by how many route collectors the announcement was seen. 27 | A detailed list of collector names is also available via the `seen_by_collectors` property. 28 | 29 | ## Dependence 30 | 31 | This crawler may create new `BGPPrefix` nodes that miss the `af` property, so the 32 | `iyp.post.address_family` postprocessing script should be run after this. 
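For readers exploring this dataset in the graph, a minimal query is sketched below. It assumes the usual IYP reference properties on relationships, in particular that these crawlers set `reference_name` to `pch.daily_routing_snapshots_v4`/`_v6`; only `count` and `seen_by_collectors` are documented above, so the filter property is an assumption.

```Cypher
// Sketch: prefixes originated by AS2497 according to PCH snapshots,
// ordered by how many route collectors saw the announcement
// (reference_name values assumed from the IYP naming scheme).
MATCH (a:AS {asn: 2497})-[o:ORIGINATE]->(p:BGPPrefix)
WHERE o.reference_name STARTS WITH 'pch.daily_routing_snapshots'
RETURN p.prefix, o.count, o.seen_by_collectors
ORDER BY o.count DESC
```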
33 | -------------------------------------------------------------------------------- /iyp/crawlers/pch/daily_routing_snapshots_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.pch import RoutingSnapshotCrawler 6 | 7 | ORG = 'Packet Clearing House' 8 | URL = 'https://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/' 9 | NAME = 'pch.daily_routing_snapshots_v4' 10 | 11 | 12 | class Crawler(RoutingSnapshotCrawler): 13 | def __init__(self, organization, url, name): 14 | self.name = name 15 | super().__init__(organization, url, name, af=4) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/pch/daily_routing_snapshots_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.pch import RoutingSnapshotCrawler 6 | 7 | ORG = 'Packet Clearing House' 8 | URL = 'https://www.pch.net/resources/Routing_Data/IPv6_daily_snapshots/' 9 | NAME = 'pch.daily_routing_snapshots_v6' 10 | 11 | 12 | class Crawler(RoutingSnapshotCrawler): 13 | def __init__(self, organization, url, name): 14 | self.name = name 15 | super().__init__(organization, url, name, af=6) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/ripe/README.md: -------------------------------------------------------------------------------- 1 | # RIPE NCC -- https://www.ripe.net/ 2 | 3 | The RIPE Network Coordination Centre (RIPE NCC) is the Regional Internet Registry (RIR) for Europe, 4 | the Middle East and parts of Central Asia. 5 | 6 | IYP does not use RIPE-specific information, but fetches data from RIPE‘s convenient [FTP 7 | server](https://ftp.ripe.net/). 8 | 9 | ## Graph representation 10 | 11 | ### AS names - `as_names.py` 12 | 13 | RIPE NCC provides a simple [list of AS names](https://ftp.ripe.net/ripe/asnames/) (also containing a 14 | country code) which is the base of many AS nodes in IYP. 
15 | 16 | ```Cypher 17 | (:AS {asn: 2497})-[:NAME]->(:Name {name: 'IIJ Internet Initiative Japan Inc.'}) 18 | (:AS {asn: 2497})-[:COUNTRY]->(:Country {country_code: 'JP'}) 19 | ``` 20 | 21 | ### Route Origin Authorizations - `roa.py` 22 | 23 | A Route Origin Authorization (ROA) is a cryptographically signed object that states which AS is 24 | authorized to originate a particular IP address prefix or set of prefixes. 25 | 26 | IYP uses RIPE NCC‘s [mirror of Trust Anchor Locators](https://ftp.ripe.net/rpki/) of the five RIRs 27 | to extract ROA information. The max length specification of the ROA is added as the `maxLength` 28 | property on the relationship. 29 | 30 | ```Cypher 31 | (:AS {asn: 2497})-[:ROUTE_ORIGIN_AUTHORIZATION {maxLength: 18}]->(:RPKIPrefix {prefix: '49.239.64.0/18'}) 32 | ``` 33 | 34 | ### Atlas Probes - `atlas_probes.py` 35 | 36 | [RIPE Atlas](https://atlas.ripe.net/) is an Internet measurement platform maintained by 37 | the RIPE NCC. RIPE Atlas employs a global network of probes that measure Internet 38 | connectivity and reachability. 39 | 40 | We fetch the [list of probes](https://atlas.ripe.net/docs/apis/rest-api-manual/probes/) 41 | to model the location of Atlas probes in the graph topology based on the probe's IP(s), 42 | ASN(s), and country. 43 | 44 | ```Cypher 45 | (:AtlasProbe {id: 6425})-[:LOCATED_IN]->(:AS {asn: 2497}) 46 | (:AtlasProbe {id: 6425})-[:COUNTRY]->(:Country {country_code: 'JP'}) 47 | (:IP {ip: '202.214.97.16'})-[:ASSIGNED]->(:AtlasProbe {id: 6425}) 48 | ``` 49 | 50 | ### Atlas Measurements - `atlas_measurements.py` 51 | 52 | We fetch the [list of 53 | measurements](https://atlas.ripe.net/docs/apis/rest-api-manual/measurements/) 54 | to obtain metadata of *ongoing* Atlas measurements. `AtlasProbe`s are `PART_OF` 55 | `AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `HostName`, or 56 | both. The Atlas platform also maps the measurement target to an `AS` number if possible. 57 | The crawler includes this relationship as well. 58 | 59 | To reduce the number of `PART_OF` relationships, this crawler ignores probes that were 60 | never connected or are abandoned. 61 | 62 | ```Cypher 63 | (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:AS {asn: 2497}) 64 | (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:HostName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'}) 65 | (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:IP {ip: '202.214.87.158'}) 66 | ``` 67 | 68 | ## Dependence 69 | 70 | The `atlas_measurement` crawler fetches probe IDs for abandoned and never-connected 71 | probes and thus should be run after the `atlas_probes` crawler. 
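As an illustration of how these nodes fit together, the sketch below lists probes hosted in AS2497 along with the ongoing measurements they participate in. It uses only the labels and relationship types documented above.

```Cypher
// Sketch: Atlas probes in AS2497 and the measurements they are part of.
MATCH (p:AtlasProbe)-[:LOCATED_IN]->(:AS {asn: 2497})
MATCH (p)-[:PART_OF]->(m:AtlasMeasurement)-[:TARGET]->(t)
RETURN p.id AS probe, m.id AS measurement, labels(t) AS target_type
LIMIT 20
```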
72 | -------------------------------------------------------------------------------- /iyp/crawlers/ripe/as_names.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import requests 6 | 7 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 8 | 9 | URL = 'https://ftp.ripe.net/ripe/asnames/asn.txt' 10 | ORG = 'RIPE NCC' 11 | NAME = 'ripe.as_names' 12 | 13 | 14 | class Crawler(BaseCrawler): 15 | 16 | def run(self): 17 | """Fetch the AS name file from RIPE website and process lines one by one.""" 18 | 19 | req = requests.get(URL) 20 | req.raise_for_status() 21 | 22 | set_modification_time_from_last_modified_header(self.reference, req) 23 | 24 | lines = [] 25 | asns = set() 26 | names = set() 27 | countries = set() 28 | 29 | # Read asn file 30 | for line in req.text.splitlines(): 31 | asn, _, name_cc = line.partition(' ') 32 | name, _, cc = name_cc.rpartition(', ') 33 | 34 | if not all((asn, name, cc)) or len(cc) > 2: 35 | logging.warning(f'Ignoring invalid line: "{line}"') 36 | continue 37 | 38 | asn = int(asn) 39 | lines.append([asn, name, cc]) 40 | 41 | asns.add(asn) 42 | names.add(name) 43 | countries.add(cc) 44 | 45 | # get node IDs for ASNs, names, and countries 46 | asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 47 | name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names) 48 | country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries) 49 | 50 | # Compute links 51 | name_links = [] 52 | country_links = [] 53 | 54 | for asn, name, cc in lines: 55 | asn_qid = asn_id[asn] 56 | name_qid = name_id[name] 57 | country_qid = country_id[cc] 58 | 59 | name_links.append({'src_id': asn_qid, 'dst_id': name_qid, 60 | 'props': [self.reference]}) # Set AS name 61 | country_links.append({'src_id': asn_qid, 'dst_id': country_qid, 62 | 'props': [self.reference]}) # Set country 63 | 64 | # Push all links to IYP 65 | self.iyp.batch_add_links('NAME', name_links) 66 | self.iyp.batch_add_links('COUNTRY', country_links) 67 | 68 | def unit_test(self): 69 | return super().unit_test(['NAME', 'COUNTRY']) 70 | 71 | 72 | def main() -> None: 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--unit-test', action='store_true') 75 | args = parser.parse_args() 76 | 77 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 78 | logging.basicConfig( 79 | format=FORMAT, 80 | filename='log/' + NAME + '.log', 81 | level=logging.INFO, 82 | datefmt='%Y-%m-%d %H:%M:%S' 83 | ) 84 | 85 | logging.info(f'Started: {sys.argv}') 86 | 87 | crawler = Crawler(ORG, URL, NAME) 88 | if args.unit_test: 89 | crawler.unit_test() 90 | else: 91 | crawler.run() 92 | crawler.close() 93 | logging.info(f'Finished: {sys.argv}') 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | sys.exit(0) 99 | -------------------------------------------------------------------------------- /iyp/crawlers/simulamet/README.md: -------------------------------------------------------------------------------- 1 | # rDNS RIR data -- https://rir-data.org/ 2 | 3 | "Lowering the Barriers to Working with Public RIR-Level Data" is a joint project of 4 | SimulaMet and the University of Twente with the goal of making WHOIS, route object 5 | delegation, and reverse DNS (rDNS) zone files published by Regional Internet Registries 6 | (RIRs) more accessible. 
7 | 8 | IYP imports the rDNS files in a simplified format to indicate which authoritative name 9 | servers are responsible for a prefix. We do not model PTR records and the corresponding 10 | hierarchy but instead add a simple MANAGED_BY link. 11 | 12 | ## Graph representation 13 | 14 | ```cypher 15 | (:RDNSPrefix {prefix: '103.2.57.0/24'})-[:MANAGED_BY {source: 'APNIC', ttl: 172800}]->(:AuthoritativeNameServer {name: 'dns0.iij.ad.jp'}) 16 | ``` 17 | 18 | The `source` property indicates from which RIR the information was obtained, the `ttl` 19 | property refers to the time-to-live of the associated SOA record. 20 | 21 | ## Dependence 22 | 23 | This crawler is not depending on other crawlers. 24 | -------------------------------------------------------------------------------- /iyp/crawlers/stanford/README.md: -------------------------------------------------------------------------------- 1 | # Stanford's ASdb -- https://asdb.stanford.edu/ 2 | 3 | ASdb is a research dataset that maps public autonomous systems (identified by 4 | ASN) to organizations and up to three industry types using data from business 5 | intelligence databases, website classifiers, and a machine learning algorithm. 6 | 7 | ## Graph representation 8 | 9 | ### AS tags 10 | Connect AS to tag nodes meaning that an AS has been categorized according to the 11 | given tag. 12 | ``` 13 | (:AS {asn:32})-[:CATEGORIZED]-(:Tag {label: 'Colleges, Universities, and Professional Schools'}) 14 | ``` 15 | 16 | ## Dependence 17 | 18 | This crawler is not depending on other crawlers. 19 | -------------------------------------------------------------------------------- /iyp/crawlers/tranco/README.md: -------------------------------------------------------------------------------- 1 | # Tranco -- https://tranco-list.eu/ 2 | 3 | The Tranco list is a research-oriented top sites ranking hardened against manipulation. It [combines 4 | the rankings of several source lists](https://tranco-list.eu/methodology) to produce a daily list 5 | that is based on data of the past 30 days. 6 | 7 | IYP uses this data to create and annotate DomainName nodes. 8 | 9 | ## Graph representation 10 | 11 | The rank of the domain is indicated by the `rank` property of the relationship. 12 | 13 | ```Cypher 14 | (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Tranco top 1M'}) 15 | ``` 16 | 17 | ## Dependence 18 | 19 | This crawler is not depending on other crawlers. 
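A minimal query against this representation, using only the node labels and properties shown above, could look like the following sketch.

```Cypher
// Sketch: ten highest-ranked domains on the current Tranco list.
MATCH (d:DomainName)-[r:RANK]->(:Ranking {name: 'Tranco top 1M'})
WHERE r.rank <= 10
RETURN d.name, r.rank
ORDER BY r.rank
```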
20 | -------------------------------------------------------------------------------- /iyp/crawlers/tranco/top1m.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import io 3 | import logging 4 | import sys 5 | from zipfile import ZipFile 6 | 7 | import requests 8 | 9 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 10 | 11 | # URL to Tranco top 1M 12 | URL = 'https://tranco-list.eu/top-1m.csv.zip' 13 | ORG = 'imec-DistriNet' 14 | NAME = 'tranco.top1m' 15 | 16 | 17 | class Crawler(BaseCrawler): 18 | def __init__(self, organization, url, name): 19 | super().__init__(organization, url, name) 20 | self.reference['reference_url_info'] = 'https://tranco-list.eu/methodology' 21 | 22 | def __set_data_url(self): 23 | """Set the data URL using the permanent ID of the current list, which stays 24 | valid once the permalink is updated.""" 25 | try: 26 | res = requests.get('https://tranco-list.eu/top-1m-id') 27 | res.raise_for_status() 28 | self.reference['reference_url_data'] = f'https://tranco-list.eu/download_daily/{res.text}' 29 | except requests.HTTPError as e: 30 | logging.warning(f'Failed to update data URL: {e}') 31 | 32 | def run(self): 33 | """Fetch Tranco top 1M and push to IYP.""" 34 | 35 | self.tranco_qid = self.iyp.get_node('Ranking', {'name': 'Tranco top 1M'}) 36 | 37 | logging.info('Downloading latest list...') 38 | req = requests.get(URL) 39 | req.raise_for_status() 40 | 41 | set_modification_time_from_last_modified_header(self.reference, req) 42 | self.__set_data_url() 43 | 44 | links = [] 45 | domains = set() 46 | # open zip file and read top list 47 | with ZipFile(io.BytesIO(req.content)) as z: 48 | with z.open('top-1m.csv') as list: 49 | for i, row in enumerate(io.TextIOWrapper(list)): 50 | row = row.rstrip() 51 | rank, domain = row.split(',') 52 | 53 | domains.add(domain) 54 | links.append({'src_name': domain, 'dst_id': self.tranco_qid, 55 | 'props': [self.reference, {'rank': int(rank)}]}) 56 | 57 | name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains) 58 | 59 | for link in links: 60 | link['src_id'] = name_id[link['src_name']] 61 | 62 | # Push all links to IYP 63 | self.iyp.batch_add_links('RANK', links) 64 | 65 | def unit_test(self): 66 | return super().unit_test(['RANK']) 67 | 68 | 69 | def main() -> None: 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('--unit-test', action='store_true') 72 | args = parser.parse_args() 73 | 74 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 75 | logging.basicConfig( 76 | format=FORMAT, 77 | filename='log/' + NAME + '.log', 78 | level=logging.INFO, 79 | datefmt='%Y-%m-%d %H:%M:%S' 80 | ) 81 | 82 | logging.info(f'Started: {sys.argv}') 83 | 84 | crawler = Crawler(ORG, URL, NAME) 85 | if args.unit_test: 86 | crawler.unit_test() 87 | else: 88 | crawler.run() 89 | crawler.close() 90 | logging.info(f'Finished: {sys.argv}') 91 | 92 | 93 | if __name__ == '__main__': 94 | main() 95 | sys.exit(0) 96 | -------------------------------------------------------------------------------- /iyp/crawlers/virginiatech/README.md: -------------------------------------------------------------------------------- 1 | # RoVista -- https://rovista.netsecurelab.org/ 2 | 3 | > RoVista aims to determine the Routing Origin Validation (ROV) status of network 4 | > operators. 5 | > 6 | > RoV Scores are determined based on the number of RPKI-invalid prefixes reachable by an 7 | > Autonomous System (AS). 
Consequently, a higher ROV score suggests that the AS can 8 | > effectively filter more RPKI-invalid prefixes. However, it is important to note that 9 | > the RoV score does not conclusively indicate whether an AS has actually implemented 10 | > ROV or not, partly due to limitations in [the] framework and other contributing 11 | > factors. 12 | 13 | IYP converts these scores (or ratios) to two Tags: 14 | 15 | - ASes with a ratio greater than 0.5 are categorized as `Validating RPKI ROV` 16 | - ASes with a ratio of less or equal 0.5 are categorized as `Not Validating RPKI ROV` 17 | 18 | ## Graph representation 19 | 20 | ```cypher 21 | (:AS {asn: 2497})-[:CATEGORIZED {ratio: 1.0}]->(:Tag {label: 'Validating RPKI ROV'}) 22 | (:AS {asn: 6762})-[:CATEGORIZED {ratio: 0}]->(:Tag {label: 'Not Validating RPKI ROV'}) 23 | ``` 24 | 25 | ## Dependence 26 | 27 | This crawler is not depending on other crawlers. 28 | -------------------------------------------------------------------------------- /iyp/crawlers/virginiatech/rovista.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from datetime import datetime, timezone 5 | 6 | import requests 7 | 8 | from iyp import BaseCrawler 9 | 10 | URL = 'https://api.rovista.netsecurelab.org/rovista/api/overview' 11 | ORG = 'Virginia Tech' 12 | NAME = 'virginiatech.rovista' 13 | 14 | 15 | class Crawler(BaseCrawler): 16 | def __init__(self, organization, url, name): 17 | super().__init__(organization, url, name) 18 | self.reference['reference_url_info'] = 'https://rovista.netsecurelab.org/' 19 | 20 | def __set_modification_time(self, entry): 21 | try: 22 | date_str = entry['lastUpdatedDate'] 23 | date = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc) 24 | self.reference['reference_time_modification'] = date 25 | except (KeyError, ValueError) as e: 26 | logging.warning(f'Failed to set modification time: {e}') 27 | 28 | def run(self): 29 | """Get RoVista data from their API.""" 30 | batch_size = 1000 31 | offset = 0 32 | entries = [] 33 | asns = set() 34 | 35 | while True: 36 | # Make a request with the current offset 37 | response = requests.get(URL, params={'offset': offset, 'count': batch_size}) 38 | response.raise_for_status() 39 | 40 | data = response.json().get('data', []) 41 | for entry in data: 42 | if not self.reference['reference_time_modification']: 43 | self.__set_modification_time(entry) 44 | asns.add(entry['asn']) 45 | if entry['ratio'] > 0.5: 46 | entries.append({'asn': entry['asn'], 'ratio': entry['ratio']}) 47 | else: 48 | entries.append({'asn': entry['asn'], 'ratio': entry['ratio']}) 49 | 50 | # Move to the next page 51 | offset += 1 52 | # Break the loop if there's no more data 53 | if len(data) < batch_size: 54 | break 55 | 56 | # get ASNs and prefixes IDs 57 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 58 | tag_id_not_valid = self.iyp.get_node('Tag', {'label': 'Not Validating RPKI ROV'}) 59 | tag_id_valid = self.iyp.get_node('Tag', {'label': 'Validating RPKI ROV'}) 60 | # Compute links 61 | links = [] 62 | for entry in entries: 63 | asn_qid = self.asn_id[entry['asn']] 64 | if entry['ratio'] > 0.5: 65 | links.append({'src_id': asn_qid, 'dst_id': tag_id_valid, 66 | 'props': [self.reference, {'ratio': entry['ratio']}]}) 67 | else: 68 | links.append({'src_id': asn_qid, 'dst_id': tag_id_not_valid, 69 | 'props': [self.reference, {'ratio': entry['ratio']}]}) 70 | 71 | # Push all links to IYP 72 | 
self.iyp.batch_add_links('CATEGORIZED', links) 73 | 74 | def unit_test(self): 75 | return super().unit_test(['CATEGORIZED']) 76 | 77 | 78 | def main() -> None: 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('--unit-test', action='store_true') 81 | args = parser.parse_args() 82 | 83 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 84 | logging.basicConfig( 85 | format=FORMAT, 86 | filename='log/' + NAME + '.log', 87 | level=logging.INFO, 88 | datefmt='%Y-%m-%d %H:%M:%S' 89 | ) 90 | 91 | logging.info(f'Started: {sys.argv}') 92 | 93 | crawler = Crawler(ORG, URL, NAME) 94 | if args.unit_test: 95 | crawler.unit_test() 96 | else: 97 | crawler.run() 98 | crawler.close() 99 | logging.info(f'Finished: {sys.argv}') 100 | 101 | 102 | if __name__ == '__main__': 103 | main() 104 | sys.exit(0) 105 | -------------------------------------------------------------------------------- /iyp/crawlers/worldbank/README.md: -------------------------------------------------------------------------------- 1 | # World Bank's country population -- https://www.worldbank.org/en/home 2 | 3 | > The World Bank Group works in every major area of development. We provide a wide array of financial products and technical assistance, and we help countries share and apply innovative knowledge and solutions to the challenges they face. 4 | 5 | > The World Bank is like a cooperative, made up of 189 member countries. These member countries, or shareholders, are represented by a Board of Governors, who are the ultimate policymakers at the World Bank. Generally, the governors are member countries' ministers of finance or ministers of development. They meet once a year at the Annual Meetings of the Boards of Governors of the World Bank Group and the International Monetary Fund. 6 | 7 | ## Graph representation 8 | 9 | ### Country Estimate 10 | Connect `Country` to an `Estimate` node meaning that a country has an estimated population of `value`. 11 | ``` 12 | (:Country)-[:POPULATION {value: 123}]->(:Estimate {name: 'World Bank Population Estimate'}) 13 | ``` 14 | 15 | ## Dependence 16 | This crawler depends on crawlers creating Country nodes. 
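The sketch below shows how this estimate can be read back, for example to weight per-country results by population; it relies only on the node labels and the `value` property documented above.

```Cypher
// Sketch: population estimate stored for Japan.
MATCH (c:Country {country_code: 'JP'})-[p:POPULATION]->(:Estimate {name: 'World Bank Population Estimate'})
RETURN c.country_code, p.value
```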
-------------------------------------------------------------------------------- /iyp/crawlers/worldbank/country_pop.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import sys 5 | from datetime import datetime, timezone 6 | 7 | import requests 8 | 9 | from iyp import BaseCrawler 10 | 11 | URL = 'https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?per_page=400&mrv=1&format=json' 12 | ORG = 'WorldBank' 13 | NAME = 'worldbank.country_pop' 14 | 15 | 16 | class Crawler(BaseCrawler): 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name) 19 | self.reference['reference_url_info'] = ( 20 | 'https://datahelpdesk.worldbank.org/knowledgebase/articles/' 21 | '889392-about-the-indicators-api-documentation' 22 | ) 23 | 24 | def run(self): 25 | """Get country population from Worldbank API and push it to IYP.""" 26 | 27 | # Get content 28 | req = requests.get(URL) 29 | req.raise_for_status() 30 | content = json.loads(req.content) 31 | 32 | # Set last time of modification 33 | self.reference['reference_time_modification'] = datetime.strptime(content[0]['lastupdated'], 34 | '%Y-%m-%d').replace(tzinfo=timezone.utc) 35 | 36 | # Get countries present in IYP cc to id mapping 37 | country_ids = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', create=False, all=True) 38 | 39 | # Get countries and population from World Bank 40 | lines = set() 41 | for entry in content[1]: 42 | 43 | country = entry['country']['id'] 44 | if country not in country_ids or not entry['value']: 45 | continue 46 | 47 | population = entry['value'] 48 | lines.add((country, population)) 49 | 50 | # Get `Estimate` node ID 51 | estimate_qid = self.iyp.get_node('Estimate', properties={'name': 'World Bank Population Estimate'}) 52 | 53 | # Compute links 54 | links = [] 55 | for (country, population) in lines: 56 | 57 | country_qid = country_ids[country] 58 | 59 | links.append({'src_id': country_qid, 'dst_id': estimate_qid, 60 | 'props': [self.reference, {'value': population}]}) 61 | 62 | # Push all links to IYP 63 | self.iyp.batch_add_links('POPULATION', links) 64 | 65 | def unit_test(self): 66 | return super().unit_test(['POPULATION']) 67 | 68 | 69 | def main() -> None: 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('--unit-test', action='store_true') 72 | args = parser.parse_args() 73 | 74 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 75 | logging.basicConfig( 76 | format=FORMAT, 77 | filename='log/' + NAME + '.log', 78 | level=logging.INFO, 79 | datefmt='%Y-%m-%d %H:%M:%S' 80 | ) 81 | 82 | logging.info(f'Started: {sys.argv}') 83 | 84 | crawler = Crawler(ORG, URL, NAME) 85 | if args.unit_test: 86 | crawler.unit_test() 87 | else: 88 | crawler.run() 89 | crawler.close() 90 | logging.info(f'Finished: {sys.argv}') 91 | 92 | 93 | if __name__ == '__main__': 94 | main() 95 | sys.exit(0) 96 | -------------------------------------------------------------------------------- /iyp/post/address_family.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp import BasePostProcess 6 | 7 | NAME = 'post.address_family' 8 | 9 | 10 | class PostProcess(BasePostProcess): 11 | def run(self): 12 | """Add address family (4 or 6 for IPv4 or IPv6) to all IP and Prefix nodes.""" 13 | 14 | # Update prefixes 15 | self.iyp.tx.run("MATCH (pfx:Prefix) WHERE pfx.prefix CONTAINS '.' 
SET pfx.af = 4") 16 | self.iyp.commit() 17 | self.iyp.tx.run("MATCH (pfx:Prefix) WHERE pfx.prefix CONTAINS ':' SET pfx.af = 6") 18 | self.iyp.commit() 19 | 20 | # Update IP addresses 21 | self.iyp.tx.run("MATCH (ip:IP) WHERE ip.ip CONTAINS '.' SET ip.af = 4") 22 | self.iyp.commit() 23 | self.iyp.tx.run("MATCH (ip:IP) WHERE ip.ip CONTAINS ':' SET ip.af = 6") 24 | self.iyp.commit() 25 | 26 | def unit_test(self): 27 | raise NotImplementedError() 28 | 29 | 30 | def main() -> None: 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--unit-test', action='store_true') 33 | args = parser.parse_args() 34 | 35 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 36 | logging.basicConfig( 37 | format=FORMAT, 38 | filename='log/' + NAME + '.log', 39 | level=logging.INFO, 40 | datefmt='%Y-%m-%d %H:%M:%S' 41 | ) 42 | 43 | logging.info(f'Started: {sys.argv}') 44 | 45 | post = PostProcess(NAME) 46 | if args.unit_test: 47 | post.unit_test() 48 | else: 49 | post.run() 50 | post.close() 51 | logging.info(f'Finished: {sys.argv}') 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | sys.exit(0) 57 | -------------------------------------------------------------------------------- /iyp/post/clean_links.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp import BasePostProcess 6 | 7 | NAME = 'post.clean_links' 8 | 9 | 10 | class PostProcess(BasePostProcess): 11 | def get_links_of_type(self, link_type, prop_dict=None): 12 | """Returns a list of all links of a given type with optional properties, 13 | including the source and destination nodes. 14 | 15 | Parameters: 16 | - link_type: The type of links to return. 17 | - prop_dict: Optional dictionary of properties to return. 18 | 19 | Returns: 20 | - List of links with the specified type and properties, including source and 21 | destination nodes. 22 | """ 23 | prop_conditions = ( 24 | ' AND '.join([f"r.{k} = '{v}'" for k, v in prop_dict.items()]) 25 | if prop_dict 26 | else 'TRUE' 27 | ) 28 | prop_str = ', '.join([f'r.{k}' for k in prop_dict.keys()]) if prop_dict else '*' 29 | 30 | query = f""" 31 | MATCH (src)-[r:{link_type}]->(dst) 32 | WHERE {prop_conditions} 33 | RETURN elementId(r) AS link_id, elementId(src) AS src_id, elementId(dst) AS dst_id, {prop_str} 34 | """ 35 | result = self.iyp.tx.run(query) 36 | if result: 37 | return [record for record in result] 38 | else: 39 | return None 40 | 41 | def delete_links(self, link_ids): 42 | """Deletes all links in the given list. 43 | 44 | Parameters: 45 | - link_ids: List of link IDs to delete. 
46 | 47 | Returns: 48 | - None 49 | """ 50 | query = """ 51 | UNWIND $link_ids AS link_id 52 | MATCH ()-[r]->() 53 | WHERE elementId(r) = link_id 54 | DELETE r 55 | """ 56 | self.iyp.tx.run(query, link_ids=link_ids) 57 | 58 | def clean_links_of_type(self, link_type, prop_dict=None): 59 | links = self.get_links_of_type(link_type, prop_dict) 60 | link_dict = {} 61 | for link in links: 62 | key = (link['src_id'], link['dst_id']) 63 | if key not in link_dict: 64 | link_dict[key] = [] 65 | link_dict[key].append(link['link_id']) 66 | 67 | # Create the new list of link IDs excluding the first one for each (src_id, 68 | # dst_id) pair 69 | filtered_link_ids = [] 70 | for key, link_ids in link_dict.items(): 71 | if len(link_ids) > 1: 72 | filtered_link_ids.extend(link_ids[1:]) 73 | 74 | self.delete_links(filtered_link_ids) 75 | 76 | def run(self): 77 | # Clean links of all types with the reference_org 'OONI' 78 | link_types = ['COUNTRY', 'RESOLVES_TO', 'PART_OF', 'CATEGORIZED'] 79 | for link_type in link_types: 80 | self.clean_links_of_type(link_type, {'reference_org': 'OONI'}) 81 | 82 | def unit_test(self): 83 | raise NotImplementedError() 84 | 85 | 86 | def main() -> None: 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--unit-test', action='store_true') 89 | args = parser.parse_args() 90 | 91 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 92 | logging.basicConfig( 93 | format=FORMAT, 94 | filename='log/' + NAME + '.log', 95 | level=logging.INFO, 96 | datefmt='%Y-%m-%d %H:%M:%S', 97 | ) 98 | 99 | logging.info(f'Started: {sys.argv}') 100 | 101 | post = PostProcess(NAME) 102 | if args.unit_test: 103 | post.unit_test() 104 | else: 105 | post.run() 106 | post.close() 107 | logging.info(f'Finished: {sys.argv}') 108 | 109 | 110 | if __name__ == '__main__': 111 | main() 112 | sys.exit(0) 113 | -------------------------------------------------------------------------------- /iyp/post/country_information.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import iso3166 6 | 7 | from iyp import BasePostProcess 8 | 9 | NAME = 'post.country_information' 10 | 11 | 12 | class PostProcess(BasePostProcess): 13 | def run(self): 14 | """Enrich Country nodes with additional information like alpha-3 codes and 15 | country names.""" 16 | 17 | country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code') 18 | 19 | for country_code in country_id: 20 | if country_code not in iso3166.countries_by_alpha2: 21 | logging.error(f'Country code "{country_code}" is not ISO 3166-1 alpha-2 conform.') 22 | continue 23 | country_info = iso3166.countries_by_alpha2[country_code] 24 | new_props = {'name': country_info.apolitical_name, 25 | 'alpha3': country_info.alpha3} 26 | self.iyp.tx.run(""" 27 | MATCH (n:Country) 28 | WHERE elementId(n) = $id 29 | SET n += $props 30 | """, 31 | id=country_id[country_code], 32 | props=new_props) 33 | self.iyp.commit() 34 | 35 | def unit_test(self): 36 | raise NotImplementedError() 37 | 38 | 39 | def main() -> None: 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--unit-test', action='store_true') 42 | args = parser.parse_args() 43 | 44 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 45 | logging.basicConfig( 46 | format=FORMAT, 47 | filename='log/' + NAME + '.log', 48 | level=logging.INFO, 49 | datefmt='%Y-%m-%d %H:%M:%S' 50 | ) 51 | 52 | logging.info(f'Started: {sys.argv}') 53 | 54 | post = PostProcess(NAME) 55 | if args.unit_test: 56 | 
post.unit_test() 57 | else: 58 | post.run() 59 | post.close() 60 | logging.info(f'Finished: {sys.argv}') 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | sys.exit(0) 66 | -------------------------------------------------------------------------------- /iyp/post/url2hostname.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import tldextract 6 | 7 | from iyp import BasePostProcess 8 | 9 | NAME = 'post.url2hostname' 10 | 11 | 12 | class PostProcess(BasePostProcess): 13 | def run(self): 14 | """Link URLs and their corresponding HostNames.""" 15 | 16 | # Get all URL nodes. 17 | url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url') 18 | 19 | # Get all HostName Nodes 20 | hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name') 21 | 22 | # Compute links 23 | links = [] 24 | for url, url_qid in url_id.items(): 25 | # Extract host name from URL 26 | hostname = tldextract.extract(url).fqdn 27 | 28 | # Get HostName node for the fqdn of the URL 29 | hostname_qid = hostname_id.get(hostname) 30 | 31 | if hostname_qid is not None: 32 | links.append({ 33 | 'src_id': url_qid, 34 | 'dst_id': hostname_qid, 35 | 'props': [self.reference] 36 | }) 37 | 38 | # push links to IYP 39 | self.iyp.batch_add_links('PART_OF', links) 40 | 41 | def unit_test(self): 42 | raise NotImplementedError() 43 | 44 | 45 | def main() -> None: 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--unit-test', action='store_true') 48 | args = parser.parse_args() 49 | 50 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 51 | logging.basicConfig( 52 | format=FORMAT, 53 | filename='log/' + NAME + '.log', 54 | level=logging.INFO, 55 | datefmt='%Y-%m-%d %H:%M:%S' 56 | ) 57 | 58 | logging.info(f'Started: {sys.argv}') 59 | 60 | post = PostProcess(NAME) 61 | if args.unit_test: 62 | post.unit_test() 63 | else: 64 | post.run() 65 | post.close() 66 | logging.info(f'Finished: {sys.argv}') 67 | 68 | 69 | if __name__ == '__main__': 70 | main() 71 | sys.exit(0) 72 | -------------------------------------------------------------------------------- /log/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternetHealthReport/internet-yellow-pages/c79ced8783c73f282d273a886f4aa5cf8018fdd9/log/.gitkeep -------------------------------------------------------------------------------- /public/README.md: -------------------------------------------------------------------------------- 1 | # IYP public instance 2 | IYP is served behind a NGINX reverse proxy. 3 | The [landing page](https://github.com/InternetHealthReport/iyp-website/) is at the root and the `iyp/` path is pointing to the neo4j instance (`public_notls`). 4 | 5 | Example nginx configuration: 6 | ``` 7 | location ^~ /iyp/ { 8 | proxy_pass http://10.255.255.11:7474/; 9 | } 10 | ``` 11 | 12 | The BOLT endpoint (iyp-bolt.iijlab.net) points directly to Neo4j's 7687 port. 13 | 14 | # Note 15 | The `public_tls` configuration is not currently used. This is required if the machine serving IYP is publicly accessible and directly accessed by clients. 
16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | neo4j 2 | iso3166 3 | progressbar2 4 | arrow 5 | tldextract 6 | py-radix 7 | requests 8 | urllib3 9 | chardet 10 | bs4 11 | requests-cache 12 | lz4 13 | frozendict 14 | docker 15 | boto3 16 | botocore 17 | pandas 18 | fastparquet 19 | flatdict 20 | requests_futures 21 | autopep8 22 | flake8 23 | pre-commit 24 | PyGithub 25 | clickhouse_driver 26 | paramiko 27 | scp 28 | -------------------------------------------------------------------------------- /run_all_crawlers.sh: -------------------------------------------------------------------------------- 1 | echo "Starting all crawlers" 2 | date 3 | 4 | # MANRS 5 | python3 -m iyp.crawlers.manrs.members 6 | 7 | # AS Names 8 | python3 -m iyp.crawlers.ripe.as_names 9 | python3 -m iyp.crawlers.bgptools.as_names 10 | python3 -m iyp.crawlers.emileaben.as_names 11 | 12 | # AS Peers 13 | python3 -m iyp.crawlers.inetintel.siblings_asdb 14 | 15 | # Rankings 16 | python3 -m iyp.crawlers.apnic.eyeball 17 | python3 -m iyp.crawlers.caida.asrank 18 | python3 -m iyp.crawlers.ihr.country_dependency 19 | 20 | # BGP data 21 | python3 -m iyp.crawlers.bgpkit.pfx2asn 22 | python3 -m iyp.crawlers.bgpkit.as2rel 23 | python3 -m iyp.crawlers.bgpkit.peerstats 24 | python3 -m iyp.crawlers.ripe.roa 25 | 26 | # IHR 27 | python3 -m iyp.crawlers.ihr.local_hegemony 28 | python3 -m iyp.crawlers.ihr.rov 29 | 30 | # DNS 31 | python3 -m iyp.crawlers.tranco.top1M 32 | python3 -m iyp.crawlers.cloudflare.top100 33 | 34 | #BGP.Tools tags, and anycast prefixes 35 | python3 -m iyp.crawlers.bgptools.tags 36 | python3 -m iyp.crawlers.bgptools.anycast_prefixes 37 | 38 | #PeeringDB 39 | python3 -m iyp.crawlers.peeringdb.org 40 | python3 -m iyp.crawlers.peeringdb.ix 41 | 42 | # Delegated files 43 | python3 -m iyp.crawlers.nro.delegated_stats 44 | 45 | # URL data 46 | python3 -m iyp.crawlers.citizenlab.urldb 47 | 48 | # OONI 49 | python3 -m iyp.crawlers.ooni.webconnectivity 50 | python3 -m iyp.crawlers.ooni.facebookmessenger 51 | python3 -m iyp.crawlers.ooni.signal 52 | python3 -m iyp.crawlers.ooni.telegram 53 | python3 -m iyp.crawlers.ooni.whatsapp 54 | python3 -m iyp.crawlers.ooni.httpheaderfieldmanipulation 55 | python3 -m iyp.crawlers.ooni.httpinvalidrequestline 56 | python3 -m iyp.crawlers.ooni.psiphon 57 | python3 -m iyp.crawlers.ooni.riseupvpn 58 | python3 -m iyp.crawlers.ooni.stunreachability 59 | python3 -m iyp.crawlers.ooni.tor 60 | python3 -m iyp.crawlers.ooni.torsf 61 | python3 -m iyp.crawlers.ooni.vanillator 62 | 63 | echo "All crawlers finished" 64 | date 65 | -------------------------------------------------------------------------------- /send_email.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import smtplib 4 | from email.message import EmailMessage 5 | from email.utils import formataddr 6 | 7 | # Load configuration file 8 | with open('config.json', 'r') as fp: 9 | conf = json.load(fp) 10 | 11 | 12 | def send_email(e): 13 | email_config = conf.get('email') 14 | if email_config is not None \ 15 | and email_config['email_address'] != '' \ 16 | and email_config['smtp_server'] != '' \ 17 | and email_config['smtp_port'] != '' \ 18 | and email_config['username'] != '' \ 19 | and email_config['password'] != '': 20 | # Send an email 21 | sender = email_config['email_address'] 22 | recipient = sender 23 | smtp_server = 
email_config['smtp_server'] 24 | smtp_port = email_config['smtp_port'] 25 | username = email_config['username'] 26 | password = email_config['password'] 27 | 28 | msg = EmailMessage() 29 | msg['From'] = formataddr(('Internet Yellow Pages.', f'{sender}')) 30 | msg['To'] = recipient 31 | msg['Subject'] = 'Error when running create_db script' 32 | msg.set_content( 33 | f"""\ 34 | An exception was caught during run of create_db script:\n\n{str(e)} 35 | """ 36 | ) 37 | msg.add_alternative( 38 | f"""\ 39 | <html> 40 | <body> 41 | <p>An exception was caught during run of create_db script:</p> 42 | <pre>{str(e)}</pre> 43 | </body> 44 | </html> 45 | """, 46 | subtype='html', 47 | ) 48 | 49 | with smtplib.SMTP(smtp_server, smtp_port) as server: 50 | server.starttls() 51 | server.login(username, password) 52 | server.sendmail(sender, recipient, msg.as_string()) 53 | logging.info('Email sent') 54 | server.quit() 55 | else: 56 | print('Email credentials not found in config file') 57 | 58 | 59 | if __name__ == '__main__': 60 | send_email('Error when running Crawler 6') 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | version='0.0.1', 5 | name='iyp', 6 | packages=find_packages(), 7 | ) 8 | -------------------------------------------------------------------------------- /unit_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | 5 | def run_crawler(crawler): 6 | print('Running Crawler: ' + crawler) 7 | subprocess.call(['python3', '-m', crawler, '--unit-test']) 8 | 9 | 10 | def run_post_script(post_script): 11 | print('Running Post Script: ' + post_script) 12 | subprocess.call(['python3', '-m', post_script, '--unit-test']) 13 | 14 | 15 | with open('config.json') as config_str: 16 | 17 | config = json.load(config_str) 18 | 19 | # read crawlers info and start unit testing of the crawlers 20 | crawlers = config['iyp']['crawlers'] 21 | for crawler in crawlers: 22 | run_crawler(crawler) 23 | 24 | # read post scripts info and start unit testing of the post scripts 25 | post_scripts = config['post'] 26 | for post_script in post_scripts: 27 | run_post_script(post_script) 28 | --------------------------------------------------------------------------------