├── .flake8 ├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── new_dataset.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── ACKNOWLEDGMENTS.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── autodeploy-config.json ├── autodeploy ├── README.md └── autodeploy.py ├── basic ├── items.csv └── properties.csv ├── caddy.template.json ├── config.json.example ├── create_db.py ├── docker-compose.yaml ├── documentation ├── README.md ├── assets │ └── gallery │ │ ├── as2497country.svg │ │ ├── as2497domainNames.svg │ │ ├── as2497ixpCountry.svg │ │ ├── as2497names.svg │ │ ├── as2501dependencies.svg │ │ ├── peeringdbAll.svg │ │ ├── prefixAllRelated.svg │ │ ├── top10kJapanAS.svg │ │ └── top20IranAS.svg ├── crawler-best-practices.md ├── data-sources.md ├── database-management.md ├── gallery.md ├── node-types.md ├── relationship-types.md └── writing-a-crawler.md ├── iyp ├── __init__.py ├── crawlers │ ├── __init__.py │ ├── alice_lg │ │ ├── README.md │ │ ├── __init__.py │ │ ├── amsix.py │ │ ├── bcix.py │ │ ├── decix.py │ │ ├── ixbr.py │ │ ├── linx.py │ │ ├── megaport.py │ │ └── netnod.py │ ├── apnic │ │ ├── README.md │ │ └── eyeball.py │ ├── bgpkit │ │ ├── README.md │ │ ├── __init__.py │ │ ├── as2rel_v4.py │ │ ├── as2rel_v6.py │ │ ├── peerstats.py │ │ └── pfx2asn.py │ ├── bgptools │ │ ├── README.md │ │ ├── anycast_prefixes.py │ │ ├── as_names.py │ │ └── tags.py │ ├── caida │ │ ├── README.md │ │ ├── __init__.py │ │ ├── as2org.py │ │ ├── as_relationships_v4.py │ │ ├── as_relationships_v6.py │ │ ├── asrank.py │ │ ├── ix_asns.py │ │ └── ixs.py │ ├── cisco │ │ ├── README.md │ │ └── umbrella_top1m.py │ ├── citizenlab │ │ ├── README.md │ │ └── urldb.py │ ├── cloudflare │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dns_top_ases.py │ │ ├── dns_top_locations.py │ │ ├── ranking_bucket.py │ │ └── top100.py │ ├── emileaben │ │ ├── README.md │ │ └── as_names.py │ ├── example │ │ ├── README.md │ │ └── crawler.py │ ├── google │ │ ├── README.md │ │ └── crux_top1m_country.py │ ├── iana │ │ ├── README.md │ │ └── root_zone.py │ ├── ihr │ │ ├── README.md │ │ ├── __init__.py │ │ ├── country_dependency.py │ │ ├── local_hegemony_v4.py │ │ ├── local_hegemony_v6.py │ │ └── rov.py │ ├── inetintel │ │ ├── README.md │ │ └── as_org.py │ ├── ipinfo │ │ ├── README.md │ │ └── ip_country.py │ ├── manrs │ │ ├── README.md │ │ └── members.py │ ├── nro │ │ ├── README.md │ │ └── delegated_stats.py │ ├── ooni │ │ ├── README.md │ │ ├── __init__.py │ │ ├── facebookmessenger.py │ │ ├── httpheaderfieldmanipulation.py │ │ ├── httpinvalidrequestline.py │ │ ├── osignal.py │ │ ├── psiphon.py │ │ ├── riseupvpn.py │ │ ├── stunreachability.py │ │ ├── telegram.py │ │ ├── tor.py │ │ ├── torsf.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── grabber.py │ │ ├── vanillator.py │ │ ├── webconnectivity.py │ │ └── whatsapp.py │ ├── openintel │ │ ├── README.md │ │ ├── __init__.py │ │ ├── crux.py │ │ ├── dnsgraph_crux.py │ │ ├── dnsgraph_jp.py │ │ ├── dnsgraph_nl.py │ │ ├── dnsgraph_rdns.py │ │ ├── infra_mx.py │ │ ├── infra_ns.py │ │ ├── tranco1m.py │ │ └── umbrella1m.py │ ├── pch │ │ ├── README.md │ │ ├── __init__.py │ │ ├── daily_routing_snapshots_v4.py │ │ ├── daily_routing_snapshots_v6.py │ │ └── show_bgp_parser.py │ ├── peeringdb │ │ ├── README.md │ │ ├── fac.py │ │ ├── ix.py │ │ └── org.py │ ├── ripe │ │ ├── README.md │ │ ├── as_names.py │ │ ├── atlas_measurements.py │ │ ├── atlas_probes.py │ │ └── roa.py │ ├── simulamet │ 
│ ├── README.md │ │ └── rirdata_rdns.py │ ├── stanford │ │ ├── README.md │ │ └── asdb.py │ ├── tranco │ │ ├── README.md │ │ └── top1m.py │ ├── virginiatech │ │ ├── README.md │ │ └── rovista.py │ └── worldbank │ │ ├── README.md │ │ └── country_pop.py └── post │ ├── address_family.py │ ├── clean_links.py │ ├── country_information.py │ ├── ip2prefix.py │ └── url2hostname.py ├── log └── .gitkeep ├── public ├── README.md ├── conf_notls │ └── neo4j.conf └── conf_tls │ └── neo4j.conf ├── requirements.txt ├── run_all_crawlers.sh ├── send_email.py ├── setup.py └── unit_test.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-doc-length = 88 4 | extend-ignore = E741 5 | show-source = true 6 | # autopep8 argument 7 | aggressive = 3 -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Run this command to always ignore formatting commits in `git blame` 2 | # git config blame.ignoreRevsFile .git-blame-ignore-revs 3 | 4 | # Codebase reformat 5 | eec710884f15bf87b1ebe8872802d8ec573a55a8 6 | # Romain's miscellaneous cleanups before codebase reformat 7 | a9638823aaeeeae99150c93074ac3d90fc2a4673 8 | 5247aee3a5d40deaa9e12c06bc92d1ee22a02174 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | *NOTE (Delete after reading): There is no need to open bug reports based on 10 | error messages in the log of the weekly database dump. We usually notice them 11 | and can judge if a simple rerun of the crawler suffices (e.g., due to a 12 | temporary connectivity issue), or if there is a bug in the crawler.* 13 | 14 | **Describe the bug** 15 | A clear and concise description of what the bug is. 16 | 17 | **To Reproduce** 18 | Steps to reproduce the behavior or example query to show the bug. 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 
20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new_dataset.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request new dataset 3 | about: Suggest a dataset to add to the database 4 | title: "" 5 | labels: "" 6 | assignees: "" 7 | --- 8 | 9 | **Explain the dataset you want to add and how it would contribute to the Internet Yellow Pages.** 10 | A clear and concise description of the dataset. 11 | 12 | **Provide the name of the organization providing the data and the url to the dataset** 13 | - Org name: 14 | - URL: 15 | 16 | **If possible describe how you would like to model the dataset in the Yellow Pages** 17 | - What is the label/type of the nodes used by this dataset? 18 | 19 | - What is the label/type of the links for this dataset? 20 | 21 | - If possible give the cypher description of the links that are desired: 22 | 23 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | 7 | ## Motivation and Context 8 | 9 | 10 | 11 | 12 | ## How Has This Been Tested? 13 | 14 | 15 | 16 | 17 | 18 | ## Screenshots (if appropriate): 19 | 20 | ## Types of changes 21 | 22 | 23 | 24 | - [ ] Bug fix (non-breaking change which fixes an issue) 25 | - [ ] New feature (non-breaking change which adds functionality) 26 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 27 | 28 | ## Checklist: 29 | 30 | 31 | 32 | 33 | - [ ] My code follows the code style of this project. 34 | - [ ] My change requires a change to the documentation. 35 | - [ ] I have updated the documentation accordingly. 36 | 37 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: Pre-commit CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout code 10 | uses: actions/checkout@v4 11 | 12 | - name: Set up Python 13 | uses: actions/setup-python@v5 14 | with: 15 | python-version-file: '.python-version' 16 | cache: 'pip' 17 | 18 | - name: Install dependencies 19 | run: pip install -r requirements.txt 20 | 21 | - name: pre-commit 22 | uses: pre-commit/action@v3.0.0 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Configuration file 2 | config.json 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .history/ 132 | dumps/ 133 | data/ 134 | .vscode/ 135 | neo4j/ 136 | tmp/ 137 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | files: \.py$ 2 | repos: 3 | - repo: https://github.com/PyCQA/autoflake 4 | rev: v2.3.1 5 | hooks: 6 | - id: autoflake 7 | args: [--in-place] 8 | - repo: https://github.com/PyCQA/isort 9 | rev: 6.0.1 10 | hooks: 11 | - id: isort 12 | args: [--thirdparty, neo4j] 13 | - repo: https://github.com/pre-commit/mirrors-autopep8 14 | rev: v2.0.4 15 | hooks: 16 | - id: autopep8 17 | - repo: https://github.com/PyCQA/docformatter 18 | rev: eb1df347edd128b30cd3368dddc3aa65edcfac38 # Don't autoupdate until https://github.com/PyCQA/docformatter/issues/293 is fixed 19 | # rev: v1.7.5 20 | hooks: 21 | - id: docformatter 22 | args: [--in-place, --wrap-summaries, '88', --wrap-descriptions, '88'] 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v5.0.0 25 | hooks: 26 | - id: double-quote-string-fixer 27 | - id: mixed-line-ending 28 | args: ['--fix', 'lf'] 29 | - repo: https://github.com/PyCQA/flake8 30 | rev: 7.1.2 31 | hooks: 32 | - id: flake8 33 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Internet Health Report 2 | 3 | First off, thanks for taking the time to contribute! 
🎉🎉 4 | 5 | When contributing to this repository, please first discuss the change you wish to make via issue 6 | with the maintainers of this repository before making a change. These are mostly guidelines, not 7 | rules. Use your best judgment, and feel free to propose changes to this document in a pull request. 8 | 9 | ## Code of Conduct 10 | 11 | This project and everyone participating in it is governed by the [IHR Code of 12 | Conduct](https://github.com/InternetHealthReport/ihr-website/blob/master/CODE_OF_CONDUCT.md), please 13 | follow it in all your interaction with the project. By participating, you are expected to uphold 14 | this code. Please report unacceptable behavior to admin@ihr.live 15 | 16 | ## Pull Request Process 17 | 18 | 1. Make sure that your code is formatted and passed linting according to the project 19 | requirements. This can easily be achieved by installing the `pre-commit` command as 20 | described below. 21 | 1. Ensure any new dependencies are added to the `requirements.txt` file. 22 | 1. Add only relevant files to the commit and ignore the rest to keep the repo clean. 23 | - If you add a new dataset / crawler, include a README.md describing the crawler 24 | and the nodes / relationships it will push to the database. 25 | - If you change the build process, update the general README.md if required. 26 | 1. You should request review from the maintainers once you submit the Pull Request. 27 | 28 | ## Instructions 29 | 30 | ### Git Workflow 31 | 32 | ```bash 33 | ## Step 1: Fork Repository 34 | 35 | ## Step 2: Git Set Up & Download 36 | # Clone the repo 37 | git clone https://github.com//.git 38 | # Add upstream remote 39 | git remote add upstream https://github.com/InternetHealthReport/internet-yellow-pages.git 40 | # Fetch and merge with upstream/main 41 | git fetch upstream 42 | git merge upstream/main 43 | 44 | ## Step 3: Setup Virtual Environment and Install Dependencies 45 | python3 -m venv --upgrade-deps .venv 46 | source .venv/bin/activate 47 | pip install -r requirements.txt 48 | 49 | ## Step 4: Setup pre-commit 50 | pre-commit install 51 | 52 | ## Step 5: Create and Publish Working Branch 53 | git checkout -b //{} 54 | git push origin //{} 55 | 56 | ## Types: 57 | # wip - Work in Progress; long term work; mainstream changes; 58 | # feat - New Feature; future planned; non-mainstream changes; 59 | # bug - Bug Fixes 60 | # exp - Experimental; random experimental features; 61 | ``` 62 | 63 | ### On Task Completion 64 | 65 | ```bash 66 | ## Commit and Push Your Work 67 | # Check branch 68 | git branch 69 | # Fetch and merge with upstream/main 70 | git fetch upstream 71 | git merge upstream/main 72 | # Add untracked files 73 | git add . 74 | # Commit all changes with appropriate commit message and description 75 | git commit -m "your-commit-message" -m "your-commit-description" 76 | # Fetch and merge with upstream/main again 77 | git fetch upstream 78 | git merge upstream/main 79 | # Push changes to your forked repository 80 | git push origin //{} 81 | ``` 82 | 83 | Create the PR using GitHub Website. 84 | 85 | Create a pull request from `//{}` branch in your forked 86 | repository to the main branch in the upstream repository. 87 | 88 | After creating the PR, add a reviewer (any admin) and yourself as the assignee. Link the PR to 89 | appropriate issue, or Project+Milestone (if no issue was created). 
 90 | 
 91 | ### After PR Merge 
 92 | 
 93 | ```bash 
 94 | # Delete branch from forked repo 
 95 | git branch -d //{} 
 96 | git push --delete origin //{} 
 97 | # Fetch and merge with upstream/main 
 98 | git checkout main 
 99 | git pull upstream 
100 | git push origin 
101 | ``` 
102 | 
103 | - Always follow [commit message standards](https://chris.beams.io/posts/git-commit/) 
104 | - About the [fork-and-branch workflow](https://blog.scottlowe.org/2015/01/27/using-fork-branch-git-workflow/) 
105 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Internet Yellow Pages 
  2 | 
  3 | The Internet Yellow Pages (IYP) is a knowledge database that gathers information about 
  4 | Internet resources (for example ASNs, IP prefixes, and domain names). 
  5 | 
  6 | ## Public IYP prototype 
  7 | 
  8 | Visit <https://iyp.iijlab.net> to try our online prototype. You will find instructions 
  9 | on how to connect to the prototype and some example queries there. For even more 
 10 | examples, check out the [IYP 
 11 | gallery](documentation/gallery.md). 
 12 | 
 13 | ## Deploy a local IYP instance 
 14 | 
 15 | We describe the basic process of deploying a local IYP instance below. For more advanced 
 16 | commands, see the [database documentation](documentation/database-management.md). 
 17 | 
 18 | ### Prerequisites 
 19 | 
 20 | - [Curl](https://curl.se/download.html) 
 21 | - [Docker](https://www.docker.com/) 
 22 | - [Docker Compose](https://docs.docker.com/compose/install/) 
 23 | - about 100GB of free disk space 
 24 | - at least 2GB of RAM 
 25 | 
 26 | ### Download the database dump 
 27 | 
 28 | Visit the [database dump repository](https://archive.ihr.live/ihr/iyp/). 
 29 | 
 30 | Dumps are organized by year, month, and day in this format: 
 31 | 
 32 | ```text 
 33 | https://archive.ihr.live/ihr/iyp/YYYY/MM/DD/iyp-YYYY-MM-DD.dump 
 34 | ``` 
 35 | 
 36 | Replace `YYYY`, `MM`, and `DD` in the URL with the desired date to access a specific 
 37 | database dump. 
 38 | 
 39 | The dump file needs to be called `neo4j.dump` and needs to be put in a folder called 
 40 | `dumps` (`dumps/neo4j.dump`). 
 41 | To create the folder and download a dump with `curl`: 
 42 | 
 43 | ```bash 
 44 | mkdir dumps 
 45 | curl https://archive.ihr.live/ihr/iyp/YYYY/MM/DD/iyp-YYYY-MM-DD.dump -o dumps/neo4j.dump 
 46 | ``` 
 47 | 
 48 | ### Set up IYP 
 49 | 
 50 | To uncompress the dump and start the database, run the following command: 
 51 | 
 52 | ```bash 
 53 | mkdir -p data 
 54 | uid="$(id -u)" gid="$(id -g)" docker compose --profile local up 
 55 | ``` 
 56 | 
 57 | This creates a `data` directory containing the database, loads the database dump, and 
 58 | starts the local IYP instance. This initial setup needs to be done only once, but it takes 
 59 | some time to completely load the database and start IYP. Please wait until IYP is fully 
 60 | loaded. Also note that this step won't work if the data directory already contains a database. 
 61 | 
 62 | This setup keeps the database instance running in the foreground. It can be stopped with 
 63 | `Ctrl+C`. Afterwards, you can simply [start/stop](#startstop-iyp) IYP in the background 
 64 | to use it. To update the database with a new dump, see [Update existing 
 65 | database](documentation/database-management.md#update-existing-database).
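If you prefer to script the dump download described above instead of calling `curl` by hand, the same steps can be wrapped in a few lines of Python. This is only a sketch and not part of the IYP tooling: it assumes the archive URL pattern shown earlier, the `requests` library, and a hypothetical helper name `download_dump`.

```python
from pathlib import Path

import requests

# Archive URL pattern documented above.
ARCHIVE_URL = 'https://archive.ihr.live/ihr/iyp/{y}/{m:02d}/{d:02d}/iyp-{y}-{m:02d}-{d:02d}.dump'


def download_dump(year: int, month: int, day: int, dest: str = 'dumps/neo4j.dump') -> None:
    """Download the IYP dump for the given date to dumps/neo4j.dump (sketch, not official tooling)."""
    url = ARCHIVE_URL.format(y=year, m=month, d=day)
    Path(dest).parent.mkdir(exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)


if __name__ == '__main__':
    download_dump(2024, 6, 15)  # Example date; pick one that exists in the archive.
```

The result is equivalent to the `mkdir`/`curl` commands above, so the same `docker compose` setup step applies afterwards.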
 66 | 
 67 | ### Start/Stop IYP 
 68 | 
 69 | To start the database, run the following command: 
 70 | 
 71 | ```bash 
 72 | docker start iyp 
 73 | ``` 
 74 | 
 75 | To stop the database, run the following command: 
 76 | 
 77 | ```bash 
 78 | docker stop iyp 
 79 | ``` 
 80 | 
 81 | ### Query the database 
 82 | 
 83 | Open <http://localhost:7474> in your favorite browser. To connect the interface to the database, use 
 84 | the default login and password: `neo4j` and `password`, respectively. Then enter your query in the top input field. 
 85 | 
 86 | For example, this finds the IXPs and corresponding country codes where IIJ (AS2497) is present: 
 87 | 
 88 | ```cypher 
 89 | MATCH (iij:AS {asn:2497})-[:MEMBER_OF]-(ix:IXP)--(cc:Country) 
 90 | RETURN iij, ix, cc 
 91 | ``` 
 92 | 
 93 | ![Countries of IXPs where AS2497 is present](/documentation/assets/gallery/as2497ixpCountry.svg) 
 94 | 
 95 | ### IYP gallery 
 96 | 
 97 | See more query examples in the [IYP gallery](/documentation/gallery.md). 
 98 | 
 99 | ## Contributing 
100 | 
101 | Want to [propose a new dataset](documentation/README.md#add-new-datasets) or [implement 
102 | a crawler](documentation/writing-a-crawler.md)? Check out the 
103 | [documentation](documentation/README.md) for more info. 
104 | 
105 | ## Changelog 
106 | 
107 | See: 
108 | 
109 | ## External links 
110 | 
111 | - [Public instance of IYP](https://iyp.iijlab.net) 
112 | - [RIPE86 presentation](https://ripe86.ripe.net/archives/video/1073/) 
113 | - [APNIC blog article](https://blog.apnic.net/2023/09/06/understanding-the-japanese-internet-with-the-internet-yellow-pages/) 
114 | 
--------------------------------------------------------------------------------
/autodeploy-config.json:
--------------------------------------------------------------------------------
 1 | { 
 2 |   "archive_base_url": "https://archive.ihr.live/ihr/iyp/", 
 3 |   "caddy_config_url": "http://sandbox.ihr.live:2019/config", 
 4 |   "caddy_post_url": "http://localhost:2019/load", 
 5 |   "caddy_template": "caddy.template.json", 
 6 |   "urls": { 
 7 |     "active_bolt": "ryan-bolt.ihr.live", 
 8 |     "active_http": "ryan.ihr.live", 
 9 |     "prev_bolt": "ryan-prev-bolt.ihr.live", 
10 |     "prev_http": "ryan-prev.ihr.live" 
11 |   } 
12 | } 
13 | 
--------------------------------------------------------------------------------
/autodeploy/README.md:
--------------------------------------------------------------------------------
 1 | # Autodeployment Script 
 2 | 
 3 | ## Usage 
 4 | 
 5 | ### Starting caddy 
 6 | 
 7 | Make sure that Caddy is running. If not, run it with `docker compose up caddy`. If Caddy 
 8 | was running previously, then the new Caddy instance will resume from the previous 
 9 | config. See the [Caddy docs](https://caddyserver.com/docs/running#docker-compose) for 
10 | more info. 
11 | 
12 | ### Running the script 
13 | 
14 | To run the script, run `python3 -m autodeploy.autodeploy `. This will first find the date 
15 | of the most recent active deployment using the Caddy config. If there is no active 
16 | deployment, today's date is used. With this date, the script will then check ihr-archive 
17 | to see if a dump has been pushed in the subsequent 7 days. If so, a neo4j instance will 
18 | be deployed using that dump. For example, if the latest deployment is for 2024-06-15, 
19 | the script will check if there is a dump for 2024-06-16 to 2024-06-23. 
20 | 
21 | Alternatively, running `python3 -m autodeploy.autodeploy --date [year]-[month]-[day]` will 
22 | check if there is a dump in the archive for the specified date and deploy it directly.
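As a rough illustration of the lookahead described above, the check against ihr-archive can be pictured like this. This is not the actual `autodeploy.py` implementation, just a simplified sketch assuming the dump naming scheme from the main README, the `requests` library, and a hypothetical helper name `find_next_dump`.

```python
from datetime import date, timedelta

import requests

ARCHIVE_BASE = 'https://archive.ihr.live/ihr/iyp/'  # Matches archive_base_url in autodeploy-config.json.


def find_next_dump(last_deployed: date, lookahead_days: int = 7) -> date | None:
    """Return the first date after last_deployed for which a dump exists, or None."""
    for offset in range(1, lookahead_days + 1):
        day = last_deployed + timedelta(days=offset)
        url = (f'{ARCHIVE_BASE}{day.year}/{day.month:02d}/{day.day:02d}/'
               f'iyp-{day.isoformat()}.dump')
        # A HEAD request is enough to know whether a dump was pushed for that day.
        if requests.head(url, timeout=30).status_code == 200:
            return day
    return None


if __name__ == '__main__':
    # With a last deployment of 2024-06-15, this probes the following week of dumps.
    print(find_next_dump(date(2024, 6, 15)))
```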
23 | 24 | ## How it works 25 | 26 | ### Checking for a dump to deploy 27 | 28 | If the date is not provided when running the script, it will first make a request to 29 | Caddy to get the current config. The config is parsed to retrieve the port of the active 30 | database. The date is parsed from the port number as explained below. Starting from this 31 | date, the next 7 days are then checked in ihr-archive for valid dumps. 32 | 33 | #### Caddy Config 34 | 35 | Caddy is updated by substituting the desired ports in the specified Caddy config 36 | template. The ports are constructed with the following structure: 1MMDD for neo4j http 37 | port, and 2MMDD for neo4j bolt port. The json is sent to caddy by making a POST request 38 | to sandbox.ihr.live:2019/load. The current config is retrieved by making a GET request 39 | to sandbox.ihr.live:2019/config. 40 | 41 | ### Starting the database 42 | 43 | Once a dump has been found, its log is downloaded from the archive. If the log indicates 44 | that there are no errors, then the dump is downloaded. A docker container is then 45 | started that loads the dump into a neo4j database. The database is stored in a docker 46 | volume with the name data-MM-DD. Another container is then used to start the database 47 | using the data stored in data-MM-YY. It binds its internal neo4j 7474 and 7687 ports to 48 | the external ones that contain the dump's date. 49 | 50 | If a container is already running for this date, it and its data volume are deleted, and 51 | a new one is created from the downloaded dump data. 52 | 53 | If there was already an active database, it becomes the previous database. The current 54 | previous database container is stopped, and its data volume is deleted. 55 | -------------------------------------------------------------------------------- /basic/items.csv: -------------------------------------------------------------------------------- 1 | #label, description, aliases, statements 2 | #basic property, "Fundamental property for this wikibase",, 3 | #basic item, "Fundamental item for this wikibase",, 4 | autonomous system, "network or group of networks that has a unified routing policy", AS, 5 | organization, "social entity (not necessarily commercial) uniting people into a structured group managing shared means to meet some needs, or to pursue collective goals", org|organisation|organizations|organisations, 6 | Internet exchange point, "physical infrastructure through which Internet service providers (ISPs) and content delivery networks (CDNs) exchange Internet traffic between their networks", ix|IX|ixp|IXP|exchange point|internet exchange, 7 | country, "sovereign state of this item",sovereign state|state|land|host country, 8 | regional Internet registry, "organization that manages the allocation and registration of Internet number resources within a region of the world", RIR,instance of:organization 9 | RIPE NCC, "regional Internet registry for Europe, West Asia, and the former USSR", RIPE,instance of:regional Internet registry 10 | APNIC, "regional Internet address registry (RIR) for the Asia-Pacific region", , instance of:regional Internet registry 11 | ARIN, "regional Internet Registry for Canada, the United States, and many Caribbean and North Atlantic islands", , instance of:regional Internet registry 12 | LACNIC, "regional Internet registry for the Latin American and Caribbean regions" , ,instance of:regional Internet registry 13 | AFRINIC, "regional Internet registry for Africa", , instance of:regional Internet registry 14 | 
computer network protocol, "communications protocol used by data networks",protocol|network protocol|data network protocol, 15 | IPv4, "Internet Protocol version 4",ipv4,instance of:computer network protocol 16 | IPv6, "Internet Protocol version 6",ipv6,instance of:computer network protocol 17 | TCP, "Transmission Control Protocol",tcp,instance of:computer network protocol 18 | UDP, "User Datagram Protocol",udp,instance of:computer network protocol 19 | ICMP, "Internet Control Message Protocol",icmp,instance of:computer network protocol 20 | IP address, "numerical label used to identify a network interface on an IP network",, 21 | IP routing prefix, "part of an IP address that identifies a network often expressed using Classless Inter-Domain Routing (CIDR) notation.", prefix|subnet|IP prefix|address prefix|network prefix,implements:IPv4 22 | PeeringDB, "PeeringDB is a freely available, user-maintained, database of networks, and the go-to location for interconnection data.",peeringdb|peeringDB,instance of:organization 23 | Internet exchange point, "An Internet exchange point is the physical infrastructure through which Internet service providers and content delivery networks (CDNs) exchange Internet traffic between their networks",IXP|IX|ix|ixp, 24 | MANRS, "MANRS is an Internet Society-supported activity aimed at securing global Internet routing",Mutually Agreed Norms for Routing Security,instance of:organization 25 | CAIDA, "Center for Applied Internet Data Analysis",,instance of:organization 26 | Internet Health Report, "IIJ's Internet Health Report",,instance of:organization 27 | Route Views, "project founded by Advanced Network Technology Center at the University of Oregon to allow Internet users to view global Border Gateway Protocol routing information",routeviews|Routeviews|RouteViews,instance of:organization 28 | traffic statistics, "URL for public traffic data",traffic data|statistics page, 29 | route server,"BGP route server is a feature designed for internet exchange (IX) operators that provides an alternative to full eBGP mesh peering among the service providers who have a presence at the IX",routeserver|rs, 30 | domain name, "identification string that defines a realm of administrative autonomy, authority or control within the Internet",, 31 | peering LAN, "LAN used for exchanging traffic between multiple organizations, usually managed by an IXP", ,instance of:IP routing prefix 32 | -------------------------------------------------------------------------------- /caddy.template.json: -------------------------------------------------------------------------------- 1 | { 2 | "apps": { 3 | "http": { 4 | "servers": { 5 | "srv0": { 6 | "listen": [":443"], 7 | "routes": [ 8 | { 9 | "match": [{ "host": ["ryan-bolt.ihr.live"] }], 10 | "handle": [ 11 | { 12 | "handler": "subroute", 13 | "routes": [ 14 | { 15 | "handle": [ 16 | { 17 | "handler": "reverse_proxy", 18 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 19 | } 20 | ] 21 | } 22 | ] 23 | } 24 | ], 25 | "terminal": true 26 | }, 27 | { 28 | "match": [{ "host": ["ryan-prev-bolt.ihr.live"] }], 29 | "handle": [ 30 | { 31 | "handler": "subroute", 32 | "routes": [ 33 | { 34 | "handle": [ 35 | { 36 | "handler": "reverse_proxy", 37 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 38 | } 39 | ] 40 | } 41 | ] 42 | } 43 | ], 44 | "terminal": true 45 | }, 46 | { 47 | "match": [{ "host": ["sandbox.ihr.live"] }], 48 | "handle": [ 49 | { 50 | "handler": "subroute", 51 | "routes": [ 52 | { 53 | "handle": [ 54 | { 55 | "handler": "reverse_proxy", 
56 | "upstreams": [{ "dial": "ryan.ihr.live:" }] 57 | } 58 | ] 59 | } 60 | ] 61 | } 62 | ], 63 | "terminal": true 64 | }, 65 | { 66 | "match": [{ "host": ["ryan.ihr.live"] }], 67 | "handle": [ 68 | { 69 | "handler": "subroute", 70 | "routes": [ 71 | { 72 | "handle": [ 73 | { 74 | "handler": "reverse_proxy", 75 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 76 | } 77 | ] 78 | } 79 | ] 80 | } 81 | ], 82 | "terminal": true 83 | }, 84 | { 85 | "match": [{ "host": ["ryan-prev.ihr.live"] }], 86 | "handle": [ 87 | { 88 | "handler": "subroute", 89 | "routes": [ 90 | { 91 | "handle": [ 92 | { 93 | "handler": "reverse_proxy", 94 | "upstreams": [{ "dial": "sandbox.ihr.live:" }] 95 | } 96 | ] 97 | } 98 | ] 99 | } 100 | ], 101 | "terminal": true 102 | } 103 | ] 104 | } 105 | } 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | iyp_loader: 3 | image: neo4j/neo4j-admin:5.26.2-community-debian 4 | profiles: ["local", "public_tls", "public_notls"] 5 | user: "${uid}:${gid}" 6 | container_name: iyp_loader 7 | tty: true 8 | stdin_open: true 9 | volumes: 10 | - ./data:/data 11 | - ./dumps:/dumps 12 | command: neo4j-admin database load neo4j --from-path=/dumps --verbose 13 | 14 | iyp: 15 | image: neo4j:5.26.3 16 | profiles: ["local"] 17 | user: "${uid}:${gid}" 18 | container_name: iyp 19 | restart: unless-stopped 20 | ports: 21 | - 7474:7474 22 | - 7687:7687 23 | volumes: 24 | - ./data:/data 25 | environment: 26 | - NEO4J_AUTH=neo4j/password 27 | depends_on: 28 | iyp_loader: 29 | condition: service_completed_successfully 30 | 31 | iyp_readonly_tls: 32 | image: neo4j:5.26.3 33 | profiles: ["public_tls"] 34 | user: "${uid}:${gid}" 35 | container_name: iyp 36 | restart: unless-stopped 37 | ports: 38 | - 7473:7473 39 | - 7687:7687 40 | volumes: 41 | - ./data:/data 42 | - ./public/certificates:/ssl 43 | - ./public/conf_tls:/conf 44 | - ./public/logs:/logs 45 | environment: 46 | - NEO4J_AUTH=neo4j/password 47 | depends_on: 48 | iyp_loader: 49 | condition: service_completed_successfully 50 | 51 | iyp_readonly_notls: 52 | image: neo4j:5.26.3 53 | profiles: ["public_notls"] 54 | user: "${uid}:${gid}" 55 | container_name: iyp 56 | restart: unless-stopped 57 | ports: 58 | - 7474:7474 59 | - 7687:7687 60 | volumes: 61 | - ./data:/data 62 | - ./public/conf_notls:/conf 63 | - ./public/logs:/logs 64 | environment: 65 | - NEO4J_AUTH=neo4j/password 66 | depends_on: 67 | iyp_loader: 68 | condition: service_completed_successfully 69 | 70 | caddy: 71 | image: caddy:latest 72 | profiles: ["caddy"] 73 | user: "${uid}:${gid}" 74 | container_name: caddy 75 | restart: unless-stopped 76 | ports: 77 | - "80:80" 78 | - "443:443" 79 | - "2019:2019" 80 | environment: 81 | - CADDY_ADMIN=0.0.0.0:2019 82 | volumes: 83 | - ./site:/srv 84 | - caddy_data:/data 85 | - caddy_config:/config 86 | command: /usr/bin/caddy run --resume 87 | 88 | volumes: 89 | caddy_data: 90 | caddy_config: 91 | -------------------------------------------------------------------------------- /documentation/README.md: -------------------------------------------------------------------------------- 1 | # IYP documentation 2 | 3 | ## Ontology 4 | 5 | The list of node and relationship types defined for IYP are available at: 6 | 7 | - [Node types](./node-types.md) 8 | - [Relationship types](./relationship-types.md) 9 | 10 | ## Data sources 11 | 12 | The list of all datasets imported in IYP is 
available [here](data-sources.md). 
13 | The dataset licenses are available in the [acknowledgments](../ACKNOWLEDGMENTS.md). 
14 | 
15 | ## Gallery 
16 | 
17 | The [IYP gallery](./gallery.md) provides example queries to help users browse the 
18 | database. 
19 | 
20 | ## Add new datasets 
21 | 
22 | ### Propose a new dataset 
23 | 
24 | Have an idea for a dataset that should be integrated into IYP? Feel free to propose it 
25 | by opening a new 
26 | [discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions). 
27 | You should describe the dataset, why it is potentially useful, and, if possible, provide 
28 | some initial idea for modeling the data. 
29 | 
30 | The discussion is used to decide if we want to integrate the dataset and how to model 
31 | it. So feel free to propose a dataset even if you have no concrete model in mind. 
32 | 
33 | ### Import a new dataset 
34 | 
35 | If it was decided that the dataset should be integrated into IYP, we will convert the 
36 | discussion into a [GitHub 
37 | issue](https://github.com/InternetHealthReport/internet-yellow-pages/issues). At this 
38 | stage it is open to anyone who wants to implement a crawler for the dataset. 
39 | 
40 | For a detailed description of how to write your first crawler and contribute to IYP, take 
41 | a look at the [IHR contributing guidelines](../CONTRIBUTING.md) and the [crawler 
42 | instructions](writing-a-crawler.md). 
43 | 
--------------------------------------------------------------------------------
/documentation/assets/gallery/as2497country.svg:
--------------------------------------------------------------------------------
 1 | [SVG figure: Neo4j graph visualization showing AS2497 linked to Country 'JP' via COUNTRY and POPULATION relationships]
--------------------------------------------------------------------------------
/documentation/database-management.md:
--------------------------------------------------------------------------------
 1 | # Advanced database commands 
 2 | 
 3 | Here we describe some additional useful commands to handle IYP dumps. 
 4 | 
 5 | ## Update existing database 
 6 | 
 7 | If you [set up the database](../README.md#set-up-iyp), you can load a new dump without 
 8 | recreating the Docker containers. Place the new dump at `dumps/neo4j.dump`, delete the 
 9 | existing database, and run only the loader again: 
10 | 
11 | ```bash 
12 | # If the database is running, stop it. 
13 | # docker stop iyp 
14 | # Delete the existing database 
15 | rm -r data/* 
16 | # Run the loader 
17 | docker start -i iyp_loader 
18 | # Start the database. 
19 | docker start iyp 
20 | ``` 
21 | 
22 | ## Save modified database 
23 | 
24 | If you made changes to the database and want to dump the contents into a file, you can 
25 | use the loader for this. For example, to dump the database into a folder called 
26 | `backups`: 
27 | 
28 | ```bash 
29 | # Directory has to exist or it will be created as root by Docker. 
30 | mkdir -p backups 
31 | uid="$(id -u)" gid="$(id -g)" docker compose run --rm -i -v "$PWD/backups:/backups" iyp_loader neo4j-admin database dump neo4j --to-path=/backups --verbose --overwrite-destination 
32 | ``` 
33 | 
34 | This will create a file called `neo4j.dump` in the `backups` folder. **Note that this
35 | will also overwrite this file if it exists!** 
36 | 
37 | ## View Neo4j logs 
38 | 
39 | To view the logs of the Neo4j container, use the following command: 
40 | 
41 | ```bash 
42 | docker logs -f iyp 
43 | ``` 
44 | 
45 | ## Create a new dump from scratch 
46 | 
47 | **Enabling all crawlers will download a lot of data and take multiple days to create a 
48 | dump.** 
49 | 
50 | Clone this repository: 
51 | 
52 | ```bash 
53 | git clone https://github.com/InternetHealthReport/internet-yellow-pages.git 
54 | cd internet-yellow-pages 
55 | ``` 
56 | 
57 | Create a Python environment and install the Python libraries: 
58 | 
59 | ```bash 
60 | python3 -m venv --upgrade-deps .venv 
61 | source .venv/bin/activate 
62 | pip install -r requirements.txt 
63 | ``` 
64 | 
65 | Create a configuration file from the example file and add API keys. **Note that some 
66 | crawlers do not work without credentials.** 
67 | 
68 | ```bash 
69 | cp config.json.example config.json 
70 | # Edit as needed 
71 | ``` 
72 | 
73 | Create and populate a new database: 
74 | 
75 | ```bash 
76 | python3 create_db.py 
77 | ``` 
78 | 
--------------------------------------------------------------------------------
/documentation/gallery.md:
--------------------------------------------------------------------------------
 1 | # IYP Gallery 
 2 | 
 3 | Below are example queries that you can copy/paste into the [Neo4j browser](https://iyp.iijlab.net/iyp/browser/?dbms=iyp-bolt.iijlab.net:443). 
 4 | 
 5 | Querying the IYP database requires familiarity with: 
 6 | - Cypher, Neo4j's query language: https://neo4j.com/docs/getting-started/current/cypher-intro/ 
 7 | - Basic networking knowledge (IP, prefixes, ASes, etc.) 
 8 | - [IYP ontology](./README.md) 
 9 | 
10 | ### Names for AS2497 
11 | Find 'Name' nodes directly connected to the node corresponding to AS2497. 
12 | ```cypher 
13 | MATCH (a:AS {asn:2497})--(n:Name) RETURN a,n 
14 | ``` 
15 | ![Names for AS2497](/documentation/assets/gallery/as2497names.svg) 
16 | 
17 | 
18 | ### All nodes related to 8.8.8.0/24 
19 | Find nodes of any type that are connected to the node corresponding to prefix 
20 | 8.8.8.0/24. 
21 | ```cypher 
22 | MATCH (gdns:Prefix {prefix:'8.8.8.0/24'})--(neighbor) 
23 | RETURN gdns, neighbor 
24 | ``` 
25 | ![All nodes related to 8.8.8.0/24](/documentation/assets/gallery/prefixAllRelated.svg) 
26 | 
27 | 
28 | ### Country code of AS2497 in delegated files 
29 | Here we search for a country node directly connected to AS2497's node that 
30 | comes from NRO's delegated stats. 
31 | ```cypher 
32 | MATCH (iij:AS {asn:2497})-[{reference_name:'nro.delegated_stats'}]-(cc:Country) 
33 | RETURN iij, cc 
34 | ``` 
35 | ![Country code of AS2497 in delegated files](/documentation/assets/gallery/as2497country.svg) 
36 | 
37 | 
38 | ### Countries of IXPs where AS2497 is present 
39 | Starting from the node corresponding to AS2497, find IXPs where AS2497 is a member 
40 | of, and then the country corresponding to each IXP. 
41 | ```cypher 
42 | MATCH (iij:AS {asn:2497})-[:MEMBER_OF]-(ix:IXP)--(cc:Country) 
43 | RETURN iij, ix, cc 
44 | ``` 
45 | ![Countries of IXPs where AS2497 is present](/documentation/assets/gallery/as2497ixpCountry.svg) 
46 | 
47 | 
48 | ### Top domain names hosted by AS2497 
49 | Select domain names in the top 50k rankings that resolve to an IP originated by 
50 | AS2497.
51 | ```cypher 52 | MATCH (:Ranking)-[r:RANK]-(dn:DomainName)-[:PART_OF]-(hn:HostName)-[:RESOLVES_TO]-(ip:IP)--(pfx:Prefix)-[:ORIGINATE]-(iij:AS {asn:2497}) 53 | WHERE r.rank < 50000 AND dn.name = hn.name 54 | RETURN hn, ip, pfx, iij 55 | ``` 56 | ![Top domain names hosted by AS2497](/documentation/assets/gallery/as2497domainNames.svg) 57 | 58 | 59 | ### ASes hosting top domain names in Japan 60 | From the top 10k domain names select domain names that ends with '.jp', the 61 | corresponding IP, prefix, and ASN. 62 | ```cypher 63 | MATCH (:Ranking)-[r:RANK]-(dn:DomainName)-[:PART_OF]-(hn:HostName)-[rt:RESOLVES_TO]-(ip:IP)-[po:PART_OF]-(pfx:Prefix)-[o:ORIGINATE]-(net:AS) 64 | WHERE dn.name ENDS WITH '.jp' AND r.rank<10000 AND dn.name = hn.name 65 | RETURN hn, ip, pfx, net, rt, po, o 66 | ``` 67 | ![ASes hosting top domain names in Japan](/documentation/assets/gallery/top10kJapanAS.svg) 68 | 69 | ### Topology for top ASes in Iran 70 | Select IHR's top 20 ASes in Iran and show how they are connected to each other using AS relationships. 71 | ```cypher 72 | MATCH (a:AS)-[ra:RANK]->(:Ranking {name: 'IHR country ranking: Total AS (IR)'})<-[rb:RANK]-(b:AS)-[p:PEERS_WITH]-(a) 73 | WHERE ra.rank < 20 AND rb.rank < 20 AND p.rel = 0 74 | RETURN a, p, b 75 | ``` 76 | ![Top ASes connecting Iran](/documentation/assets/gallery/top20IranAS.svg) 77 | 78 | ### Topology for AS2501's dependencies 79 | Select AS dependencies for AS2501 and find the shortest PEERS_WITH relationship to these ASes. 80 | ```cypher 81 | MATCH (a:AS {asn:2501})-[h:DEPENDS_ON {af:4}]->(d:AS) 82 | WITH a, COLLECT(DISTINCT d) AS dependencies 83 | UNWIND dependencies as d 84 | MATCH p = allShortestPaths((a)-[:PEERS_WITH*]-(d)) 85 | WHERE a.asn <> d.asn AND all(r IN relationships(p) WHERE r.af = 4) AND all(n IN nodes(p) WHERE n IN dependencies) 86 | RETURN p 87 | ``` 88 | ![Dependencies for AS2501](/documentation/assets/gallery/as2501dependencies.svg) 89 | 90 | ### List of IPs for RIPE RIS full feed peers (more than 800k prefixes) 91 | 92 | ```cypher 93 | MATCH (n:BGPCollector)-[p:PEERS_WITH]-(a:AS) 94 | WHERE n.project = 'riperis' AND p.num_v4_pfxs > 800000 95 | RETURN n.name, COUNT(DISTINCT p.ip) AS nb_full, COLLECT(DISTINCT p.ip) AS ips_full 96 | ``` 97 | 98 | ### Active RIPE Atlas probes for the top 5 ISPs in Japan 99 | 100 | ```cypher 101 | MATCH (pb:AtlasProbe)-[:LOCATED_IN]-(a:AS)-[pop:POPULATION]-(c:Country) 102 | WHERE c.country_code = 'JP' AND pb.status_name = 'Connected' AND pop.rank <= 5 103 | RETURN pop.rank, a.asn, COLLECT(pb.id) AS probe_ids ORDER BY pop.rank 104 | ``` 105 | -------------------------------------------------------------------------------- /documentation/writing-a-crawler.md: -------------------------------------------------------------------------------- 1 | # How to write your first crawler 2 | 3 | To import a new dataset into IYP, you should write a crawler for that dataset. The main 4 | tasks of a crawler are to fetch data, parse it, model it with IYP ontology, and push it 5 | to the IYP database. Most of these tasks are assisted by the [IYP python 6 | library](../iyp/__init__.py). See the [example 7 | crawler](../iyp/crawlers/example/crawler.py) or [existing crawlers](../iyp/crawlers/) 8 | for getting started. 9 | See also the [IHR contributing guidelines](../CONTRIBUTING.md) and [best practices for 10 | writing crawlers](crawler-best-practices.md). 11 | 12 | ## README 13 | 14 | Each crawler should be accompanied by a README.md file. 
This is the main documentation 15 | for the crawler, it should contain: 16 | 17 | - a short description of the dataset, 18 | - any specificities related to the way the data is imported (e.g., time span, data cleaning), 19 | - examples of how the data is modeled, 20 | - dependencies to other crawlers (e.g., if the crawler requires data from another one). 21 | 22 | ## Adding a crawler to IYP main branch 23 | 24 | If you wish your crawler to be part of the IYP weekly dumps, you can submit a [Pull 25 | Request](https://github.com/InternetHealthReport/internet-yellow-pages/pulls) to include 26 | the crawler to IYP's GitHub repository main branch. 27 | 28 | Along with the Python code and README, the addition of new datasets should also be 29 | reflected in the following files: 30 | 31 | - the list of [imported datasets](./data-sources.md), 32 | - the [IYP acknowledgments](../ACKNOWLEDGMENTS.md) file should list the license of all imported dataset. 33 | 34 | Changes to the ontology should be discussed in advance, either on [GitHub 35 | discussion](https://github.com/InternetHealthReport/internet-yellow-pages/discussions) 36 | or by reaching out to [IYP maintainers](mailto:iyp@ihr.live), so that a consensus is reached 37 | before the ontology is updated. 38 | **Any change to the ontology should be reflected in the documentation:** ([Node 39 | types](./node-types.md) and [Relationship types](./relationship-types.md)). 40 | 41 | You can also consider adding example queries to the [IYP gallery](./gallery.md), and 42 | organizations providing data to the [IYP frontpage](https://iyp.iijlab.net/). 43 | -------------------------------------------------------------------------------- /iyp/crawlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternetHealthReport/internet-yellow-pages/c79ced8783c73f282d273a886f4aa5cf8018fdd9/iyp/crawlers/__init__.py -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/README.md: -------------------------------------------------------------------------------- 1 | # Alice-LG -- https://github.com/alice-lg/alice-lg 2 | 3 | Alice-LG is a BGP looking glass which gets its data from external APIs. 4 | 5 | It is used by some large IXPs (e.g., DE-CIX, LINX, AMS-IX) and IYP imports membership 6 | information by reading the route server neighbors. 7 | 8 | The crawler *can* also import the received routes of all neighbors, however testing has 9 | shown that this takes an unreasonable amount of time for most IXPs due to the tiny 10 | pagination size (250 routes per page). Therefore this functionality is disabled by default. 11 | 12 | List of supported IXPs: 13 | 14 | - AMS-IX (`amsix.py`) 15 | - BCIX (`bcix.py`) 16 | - DE-CIX (`decix.py`) 17 | - IX.br (`ixbr.py`) 18 | - LINX (`linx.py`) 19 | - Megaport (`megaport.py`) 20 | - Netnod (`netnod.py`) 21 | 22 | ## Graph representation 23 | 24 | ```Cypher 25 | (:AS {asn: 2497})-[:MEMBER_OF {address: '80.81.193.136', routeserver_id: 'rs1_fra_ipv4'}]->(:IXP {name: 'DE-CIX Frankfurt'}) 26 | // Routes are not crawled by default 27 | (:AS {asn: 3333})-[:ORIGINATE {neighbor_id: 'pb_0280_as20562', routeserver_id: 'rs01-bcix-v4'}]->(:BGPPrefix {prefix: '193.0.0.0/21'}) 28 | ``` 29 | 30 | There is the possibility of multiple relationships between the same node. 
However, these 31 | contain different information, e.g., a member is present with multiple interfaces 32 | (`address`) or the information is seen by different route servers (`routeserver_id`). 33 | Similarly, a route can be seen via multiple neighbors (`neighbor_id`) or different route 34 | servers (`routeserver_id`). 35 | 36 | ## Dependence 37 | 38 | This crawler requires peering LAN information to map the neighbor IP to an IXP. 39 | Therefore, it should be run after crawlers that create 40 | 41 | ```Cypher 42 | (:PeeringLAN)-[:MANAGED_BY]->(:IXP) 43 | ``` 44 | 45 | relationships: 46 | 47 | - `iyp.crawlers.peeringdb.ix` 48 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/amsix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.ams-ix.net/api/v1/' 9 | NAME = 'alice_lg.amsix' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/bcix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.bcix.de/api/v1/' 9 | NAME = 'alice_lg.bcix' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/decix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.de-cix.net/api/v1/' 9 | NAME = 'alice_lg.decix' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 
| ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/ixbr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.ix.br/api/v1/' 9 | NAME = 'alice_lg.ixbr' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/linx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://alice-rs.linx.net/api/v1/' 9 | NAME = 'alice_lg.linx' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/megaport.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.megaport.com/api/v1/' 9 | NAME = 'alice_lg.megaport' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 
| -------------------------------------------------------------------------------- /iyp/crawlers/alice_lg/netnod.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.alice_lg import Crawler 6 | 7 | ORG = 'Alice-LG' 8 | URL = 'https://lg.netnod.se/api/v1/' 9 | NAME = 'alice_lg.netnod' 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--unit-test', action='store_true') 15 | args = parser.parse_args() 16 | 17 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 18 | logging.basicConfig( 19 | format=FORMAT, 20 | filename='log/' + NAME + '.log', 21 | level=logging.INFO, 22 | datefmt='%Y-%m-%d %H:%M:%S' 23 | ) 24 | 25 | logging.info(f'Started: {sys.argv}') 26 | 27 | crawler = Crawler(ORG, URL, NAME) 28 | if args.unit_test: 29 | crawler.unit_test() 30 | else: 31 | crawler.run() 32 | crawler.close() 33 | logging.info(f'Finished: {sys.argv}') 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /iyp/crawlers/apnic/README.md: -------------------------------------------------------------------------------- 1 | # APNIC population estimates -- https://labs.apnic.net/ 2 | 3 | Population per AS estimated using an advertisement on Youtube. This dataset 4 | provides: 5 | - the percentage of the population per country/AS 6 | - AS ranking in terms of population 7 | - AS names 8 | 9 | 10 | ## Graph representation 11 | 12 | ### Population 13 | Connect AS to country nodes with a 'population' relationship representing the 14 | percentage of the country's population hosted by the AS. 15 | 16 | ``` 17 | (:AS {asn:2516})-[:POPULATION {percent:19.3}]-(:Country {country_code:'JP'}) 18 | ``` 19 | 20 | 21 | ### Country 22 | Connect AS to country nodes, meaning that the AS serves people in that country. 23 | 24 | ``` 25 | (:AS)-[:COUNTRY]-(:Country) 26 | ``` 27 | 28 | ### Ranking 29 | Connect ASes to ranking nodes which are also connected to a country. Meaning 30 | that an AS is ranked for a certain country in terms of population. 31 | For example: 32 | ``` 33 | (:AS {asn:2516})-[:RANK {rank:1}]-(:Ranking)--(:Country {country_code:'JP'}) 34 | ``` 35 | 36 | ### AS name 37 | Connect AS to names nodes, providing the name of ranked ASes. 38 | For example: 39 | ``` 40 | (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) 41 | ``` 42 | 43 | 44 | ## Dependence 45 | 46 | This crawler is not depending on other crawlers. 47 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/README.md: -------------------------------------------------------------------------------- 1 | # BGPKIT data -- https://data.bgpkit.com 2 | 3 | Data inferred from RouteViews and RIPE RIS BGP data, including: 4 | - AS relationship 5 | - prefix to ASN mappings 6 | - BGP collectors' peers stats 7 | 8 | 9 | ## Graph representation 10 | 11 | ### AS relationship 12 | Connect ASes that are peering with each other. The 'rel' attribute and the link 13 | direction gives the type of relationship between the two ASes: 14 | - rel=0: peer to peer relationship 15 | - rel=1: provider/customer relationship. A->B means A is the provider of B. 
16 | 17 | ``` 18 | (:AS {asn:2497})-[:PEERS_WITH {rel: 0, af: 4}]-(:AS {asn:2914}) 19 | ``` 20 | 21 | 22 | ### Peers stats 23 | Connect AS nodes to BGP route collector nodes, meaning that an AS peers with 24 | a route collector hence participating in the RIS or RouteViews projects. 25 | 26 | ``` 27 | (:AS {asn:2497})-[:PEERS_WITH]-(:BGPCollector {project: 'riperis', name:'rrc06'}) 28 | ``` 29 | 30 | ### Prefix to ASN 31 | Connect AS nodes to prefix nodes representing the prefixes originated by an AS. 32 | For example: 33 | ``` 34 | (:AS {asn:2497})-[:ORIGINATE]-(:BGPPrefix {prefix: '101.128.128.0/17'}) 35 | ``` 36 | 37 | ## Dependence 38 | 39 | This crawler is not depending on other crawlers. 40 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/__init__.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import json 3 | 4 | import requests 5 | 6 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 7 | 8 | 9 | class AS2RelCrawler(BaseCrawler): 10 | def __init__(self, organization, url, name, af): 11 | """Initialization: set the address family attribute (af)""" 12 | super().__init__(organization, url, name) 13 | self.af = af 14 | self.reference['reference_url_info'] = 'https://data.bgpkit.com/as2rel/README.txt' 15 | 16 | def run(self): 17 | """Fetch the AS relationship file from BGPKIT website and process lines one by 18 | one.""" 19 | 20 | req = requests.get(self.url, stream=True) 21 | req.raise_for_status() 22 | 23 | set_modification_time_from_last_modified_header(self.reference, req) 24 | 25 | rels = [] 26 | asns = set() 27 | 28 | # Collect all ASNs 29 | for rel in json.load(bz2.open(req.raw)): 30 | asns.add(rel['asn1']) 31 | asns.add(rel['asn2']) 32 | rels.append(rel) 33 | 34 | # get ASNs IDs 35 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 36 | 37 | # Compute links 38 | links = [] 39 | for rel in rels: 40 | as1_qid = self.asn_id[rel['asn1']] 41 | as2_qid = self.asn_id[rel['asn2']] 42 | rel['af'] = self.af 43 | 44 | links.append({'src_id': as1_qid, 'dst_id': as2_qid, 'props': [self.reference, rel]}) 45 | 46 | # Push all links to IYP 47 | self.iyp.batch_add_links('PEERS_WITH', links) 48 | 49 | def unit_test(self): 50 | return super().unit_test(['PEERS_WITH']) 51 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/as2rel_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.bgpkit import AS2RelCrawler 6 | 7 | URL = 'https://data.bgpkit.com/as2rel/as2rel-v4-latest.json.bz2' 8 | ORG = 'BGPKIT' 9 | NAME = 'bgpkit.as2rel_v4' 10 | 11 | AF = 4 12 | 13 | 14 | class Crawler(AS2RelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, AF) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: 
{sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/as2rel_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.bgpkit import AS2RelCrawler 6 | 7 | URL = 'https://data.bgpkit.com/as2rel/as2rel-v6-latest.json.bz2' 8 | ORG = 'BGPKIT' 9 | NAME = 'bgpkit.as2rel_v6' 10 | 11 | AF = 6 12 | 13 | 14 | class Crawler(AS2RelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, AF) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/peerstats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bz2 3 | import json 4 | import logging 5 | import sys 6 | from datetime import datetime, time, timedelta, timezone 7 | 8 | import requests 9 | 10 | from iyp import BaseCrawler 11 | 12 | MAIN_PAGE = 'https://data.bgpkit.com/peer-stats/' 13 | URL = 'https://data.bgpkit.com/peer-stats/{collector}/{year}/{month:02d}/peer-stats_{collector}_{year}-{month:02d}-{day:02d}_{epoch}.bz2' # noqa: E501 14 | ORG = 'BGPKIT' 15 | NAME = 'bgpkit.peerstats' 16 | 17 | 18 | class Crawler(BaseCrawler): 19 | def __init__(self, organization, url, name): 20 | super().__init__(organization, url, name) 21 | self.reference['reference_url_info'] = 'https://data.bgpkit.com/peer-stats/README.md' 22 | 23 | def run(self): 24 | """Fetch peer stats for each collector.""" 25 | 26 | req = requests.get(MAIN_PAGE) 27 | req.raise_for_status() 28 | 29 | # Find all collectors 30 | collectors = [] 31 | for line in req.text.splitlines(): 32 | if line.strip().startswith('') and line.endswith('/'): 33 | collectors.append(line.partition('>')[2].partition('/')[0]) 34 | 35 | # Find latest date 36 | prev_day = datetime.combine(datetime.utcnow(), time.min, timezone.utc) 37 | self.now = None 38 | req = None 39 | trials = 0 40 | 41 | while (req is None or req.status_code != 200) and trials < 7: 42 | self.now = prev_day 43 | # Check if today's data is available 44 | url = URL.format(collector='rrc10', year=self.now.year, 45 | month=self.now.month, day=self.now.day, 46 | epoch=int(self.now.timestamp())) 47 | req = requests.head(url) 48 | 49 | prev_day -= timedelta(days=1) 50 | logging.warning("Today's data not yet available!") 51 | 52 | self.reference['reference_time_modification'] = self.now 53 | for collector in collectors: 54 | url = URL.format(collector=collector, year=self.now.year, 55 | month=self.now.month, day=self.now.day, 56 | epoch=int(self.now.timestamp())) 57 | 58 | req = requests.get(url, stream=True) 59 | if req.status_code != 200: 60 | logging.warning(f'Data not 
available for {collector}') 61 | continue 62 | 63 | # keep track of collector and reference url 64 | stats = json.load(bz2.open(req.raw)) 65 | collector_qid = self.iyp.get_node( 66 | 'BGPCollector', 67 | {'name': stats['collector'], 'project': stats['project']} 68 | ) 69 | self.reference['reference_url_data'] = url 70 | 71 | asns = set() 72 | 73 | # Collect all ASNs and names 74 | for peer in stats['peers'].values(): 75 | asns.add(peer['asn']) 76 | 77 | # get ASNs' IDs 78 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 79 | 80 | # Compute links 81 | links = [] 82 | for peer in stats['peers'].values(): 83 | as_qid = self.asn_id[peer['asn']] 84 | links.append({'src_id': as_qid, 'dst_id': collector_qid, 85 | 'props': [self.reference, peer]}) # Set AS name 86 | 87 | # Push all links to IYP 88 | self.iyp.batch_add_links('PEERS_WITH', links) 89 | 90 | def unit_test(self): 91 | return super().unit_test(['PEERS_WITH']) 92 | 93 | 94 | def main() -> None: 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument('--unit-test', action='store_true') 97 | args = parser.parse_args() 98 | 99 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 100 | logging.basicConfig( 101 | format=FORMAT, 102 | filename='log/' + NAME + '.log', 103 | level=logging.INFO, 104 | datefmt='%Y-%m-%d %H:%M:%S' 105 | ) 106 | 107 | logging.info(f'Started: {sys.argv}') 108 | 109 | crawler = Crawler(ORG, URL, NAME) 110 | if args.unit_test: 111 | crawler.unit_test() 112 | else: 113 | crawler.run() 114 | crawler.close() 115 | logging.info(f'Finished: {sys.argv}') 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | sys.exit(0) 121 | -------------------------------------------------------------------------------- /iyp/crawlers/bgpkit/pfx2asn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bz2 3 | import json 4 | import logging 5 | import sys 6 | from ipaddress import ip_network 7 | 8 | import requests 9 | 10 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 11 | 12 | URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2' 13 | ORG = 'BGPKIT' 14 | NAME = 'bgpkit.pfx2asn' 15 | 16 | 17 | class Crawler(BaseCrawler): 18 | 19 | def run(self): 20 | """Fetch the prefix to ASN file from BGPKIT website and process lines one by 21 | one.""" 22 | 23 | req = requests.get(URL, stream=True) 24 | req.raise_for_status() 25 | 26 | set_modification_time_from_last_modified_header(self.reference, req) 27 | 28 | entries = [] 29 | asns = set() 30 | prefixes = set() 31 | 32 | for entry in json.load(bz2.open(req.raw)): 33 | try: 34 | prefix = ip_network(entry['prefix']).compressed 35 | except ValueError as e: 36 | logging.warning(f'Ignoring malformed prefix: "{entry["prefix"]}": {e}') 37 | continue 38 | entry['prefix'] = prefix 39 | prefixes.add(prefix) 40 | asns.add(entry['asn']) 41 | entries.append(entry) 42 | 43 | req.close() 44 | 45 | # get ASNs and prefixes IDs 46 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 47 | self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes, all=False) 48 | # Add the BGPPrefix label 49 | self.iyp.batch_add_node_label(list(self.prefix_id.values()), 'BGPPrefix') 50 | 51 | # Compute links 52 | links = [] 53 | for entry in entries: 54 | asn_qid = self.asn_id[entry['asn']] 55 | prefix_qid = self.prefix_id[entry['prefix']] 56 | 57 | links.append({'src_id': asn_qid, 'dst_id': prefix_qid, 'props': [self.reference, 
entry]}) 58 | 59 | # Push all links to IYP 60 | self.iyp.batch_add_links('ORIGINATE', links) 61 | 62 | def unit_test(self): 63 | return super().unit_test(['ORIGINATE']) 64 | 65 | 66 | def main() -> None: 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--unit-test', action='store_true') 69 | args = parser.parse_args() 70 | 71 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 72 | logging.basicConfig( 73 | format=FORMAT, 74 | filename='log/' + NAME + '.log', 75 | level=logging.INFO, 76 | datefmt='%Y-%m-%d %H:%M:%S' 77 | ) 78 | 79 | logging.info(f'Started: {sys.argv}') 80 | 81 | crawler = Crawler(ORG, URL, NAME) 82 | if args.unit_test: 83 | crawler.unit_test() 84 | else: 85 | crawler.run() 86 | crawler.close() 87 | logging.info(f'Finished: {sys.argv}') 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | sys.exit(0) 93 | -------------------------------------------------------------------------------- /iyp/crawlers/bgptools/README.md: -------------------------------------------------------------------------------- 1 | # BGP.Tools -- https://bgp.tools 2 | 3 | Data collected by BGP.Tools, including: 4 | - AS names 5 | - AS tags 6 | - Anycast IPv4, and IPv6 prefixes 7 | 8 | 9 | ## Graph representation 10 | 11 | ### AS names 12 | Connect AS to names nodes, providing the name of an AS. 13 | For example: 14 | ``` 15 | (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) 16 | ``` 17 | 18 | ### AS tags 19 | Connect AS to tag nodes meaning that an AS has been categorized according to the 20 | given tag. 21 | ``` 22 | (:AS {asn:2497})-[:CATEGORIZED]-(:Tag {label: 'Internet Critical Infra'}) 23 | ``` 24 | 25 | ### Anycast IPv4 and IPv6 prefixes 26 | Connect Prefix to tag node meaning that an prefix has been categorized according to the TAG with a label `Anycast`. 27 | ``` 28 | (:BGPPrefix {prefix: '43.249.213.0/24'})-[:CATEGORIZED]-(:Tag {label: 'Anycast'}) 29 | ``` 30 | 31 | ## Dependence 32 | 33 | This crawler is not depending on other crawlers. 34 | -------------------------------------------------------------------------------- /iyp/crawlers/bgptools/as_names.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from io import BytesIO 5 | 6 | import pandas as pd 7 | import requests 8 | 9 | from iyp import BaseCrawler 10 | 11 | URL = 'https://bgp.tools/asns.csv' 12 | ORG = 'BGP.Tools' 13 | NAME = 'bgptools.as_names' 14 | 15 | 16 | class Crawler(BaseCrawler): 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name) 19 | self.reference['reference_url_info'] = 'https://bgp.tools/kb/api' 20 | 21 | self.headers = { 22 | 'user-agent': 'IIJ/Internet Health Report - admin@ihr.live' 23 | } 24 | 25 | @staticmethod 26 | def replace_link_ids(links: list, src_id: dict = dict(), dst_id=dict()): 27 | """Replace the src_id and dst_id values from links with their actual id.""" 28 | for link in links: 29 | if src_id: 30 | link['src_id'] = src_id[link['src_id']] 31 | if dst_id: 32 | link['dst_id'] = dst_id[link['dst_id']] 33 | 34 | def run(self): 35 | """Fetch the AS name file from BGP.Tools website and push it to IYP.""" 36 | 37 | req = requests.get(URL, headers=self.headers) 38 | req.raise_for_status() 39 | 40 | df = pd.read_csv(BytesIO(req.content), keep_default_na=False) 41 | 42 | asns = set() 43 | names = set() 44 | tags = set() 45 | name_links = list() 46 | tag_links = list() 47 | 48 | # Normally we would use itertuples, since it is way faster. 
But we want to be 49 | # robust against format changes and since one column is called "class", which is 50 | # a Python keyword, the field name would be replaced by a positional value, 51 | # e.g., r._3 instead of r.class, which means that if the format is changed, this 52 | # crawler breaks again. 53 | # Since the data set is not too large, iterrows is fine performance-wise. 54 | for r in df.iterrows(): 55 | has_link = False 56 | entry = r[1] 57 | asn = entry['asn'] 58 | if not asn.startswith('AS'): 59 | logging.warning(f'asn field does not start with "AS": {entry}') 60 | continue 61 | asn = int(asn[2:]) 62 | name = entry['name'] 63 | if name != 'ERR_AS_NAME_NOT_FOUND': 64 | names.add(name) 65 | name_links.append({'src_id': asn, 'dst_id': name, 'props': [self.reference]}) 66 | has_link = True 67 | tag = entry['class'] 68 | if tag != 'Unknown': 69 | tags.add(tag) 70 | tag_links.append({'src_id': asn, 'dst_id': tag, 'props': [self.reference]}) 71 | has_link = True 72 | if has_link: 73 | # Only create AS nodes if we have a relationship. 74 | asns.add(asn) 75 | 76 | asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 77 | name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names, all=False) 78 | tag_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', tags, all=False) 79 | 80 | self.replace_link_ids(name_links, asn_id, name_id) 81 | self.replace_link_ids(tag_links, asn_id, tag_id) 82 | 83 | self.iyp.batch_add_links('NAME', name_links) 84 | self.iyp.batch_add_links('CATEGORIZED', tag_links) 85 | 86 | def unit_test(self): 87 | return super().unit_test(['NAME']) 88 | 89 | 90 | def main() -> None: 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('--unit-test', action='store_true') 93 | args = parser.parse_args() 94 | 95 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 96 | logging.basicConfig( 97 | format=FORMAT, 98 | filename='log/' + NAME + '.log', 99 | level=logging.INFO, 100 | datefmt='%Y-%m-%d %H:%M:%S' 101 | ) 102 | 103 | logging.info(f'Started: {sys.argv}') 104 | 105 | crawler = Crawler(ORG, URL, NAME) 106 | if args.unit_test: 107 | crawler.unit_test() 108 | else: 109 | crawler.run() 110 | crawler.close() 111 | logging.info(f'Finished: {sys.argv}') 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | sys.exit(0) 117 | -------------------------------------------------------------------------------- /iyp/crawlers/bgptools/tags.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from datetime import datetime, time, timezone 5 | 6 | import requests 7 | 8 | from iyp import BaseCrawler 9 | 10 | # curl -s https://bgp.tools/asns.csv | head -n 5 11 | URL = 'https://bgp.tools/tags/' 12 | ORG = 'BGP.Tools' 13 | NAME = 'bgptools.tags' 14 | 15 | TAGS = { 16 | 'cdn': 'Content Delivery Network', 17 | 'dsl': 'Home ISP', 18 | 'a10k': 'Tranco 10k Host', 19 | 'icrit': 'Internet Critical Infra', 20 | 'tor': 'ToR Services', 21 | 'anycast': 'Anycast', 22 | 'perso': 'Personal ASN', 23 | 'ddosm': 'DDoS Mitigation', 24 | 'vpn': 'VPN Host', 25 | 'vpsh': 'Server Hosting', 26 | 'uni': 'Academic', 27 | 'gov': 'Government', 28 | 'event': 'Event', 29 | 'mobile': 'Mobile Data/Carrier', 30 | 'satnet': 'Satellite Internet', 31 | 'biznet': 'Business Broadband', 32 | 'corp': 'Corporate/Enterprise', 33 | 'rpkirov': 'Validating RPKI ROV' 34 | } 35 | 36 | 37 | class Crawler(BaseCrawler): 38 | def __init__(self, organization, url, name): 39 | 
super().__init__(organization, url, name) 40 | self.reference['reference_url_info'] = 'https://bgp.tools/kb/api' 41 | 42 | self.headers = { 43 | 'user-agent': 'IIJ/Internet Health Report - admin@ihr.live' 44 | } 45 | 46 | def run(self): 47 | """Fetch the AS name file from BGP.Tools website and process lines one by 48 | one.""" 49 | 50 | for tag, label in TAGS.items(): 51 | url = URL + tag + '.csv' 52 | # Reference information for data pushed to the wikibase 53 | self.reference = { 54 | 'reference_org': ORG, 55 | 'reference_url_data': url, 56 | 'reference_name': NAME, 57 | 'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc) 58 | } 59 | 60 | req = requests.get(url, headers=self.headers) 61 | req.raise_for_status() 62 | 63 | self.tag_qid = self.iyp.get_node('Tag', {'label': label}) 64 | for line in req.text.splitlines(): 65 | # skip header 66 | if line.startswith('asn'): 67 | continue 68 | 69 | # Parse given line to get ASN, name, and country code 70 | asn, _, _ = line.partition(',') 71 | asn_qid = self.iyp.get_node('AS', {'asn': asn[2:]}) 72 | statements = [['CATEGORIZED', self.tag_qid, self.reference]] # Set AS name 73 | 74 | # Update AS name and country 75 | self.iyp.add_links(asn_qid, statements) 76 | 77 | def unit_test(self): 78 | return super().unit_test(['CATEGORIZED']) 79 | 80 | 81 | def main() -> None: 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument('--unit-test', action='store_true') 84 | args = parser.parse_args() 85 | 86 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 87 | logging.basicConfig( 88 | format=FORMAT, 89 | filename='log/' + NAME + '.log', 90 | level=logging.INFO, 91 | datefmt='%Y-%m-%d %H:%M:%S' 92 | ) 93 | 94 | logging.info(f'Started: {sys.argv}') 95 | 96 | crawler = Crawler(ORG, URL, NAME) 97 | if args.unit_test: 98 | crawler.unit_test() 99 | else: 100 | crawler.run() 101 | crawler.close() 102 | logging.info(f'Finished: {sys.argv}') 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | sys.exit(0) 108 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/README.md: -------------------------------------------------------------------------------- 1 | # CAIDA -- https://caida.org 2 | 3 | ## ASRank (asrank.py) 4 | 5 | AS rank in terms of customer cone size, meaning that large transit providers are 6 | higher ranked. 7 | 8 | ### Graph representation 9 | 10 | Ranking: 11 | 12 | Connect ASes nodes to a single ranking node corresponding to ASRank. The rank is 13 | given as a link attribute. 14 | For example: 15 | 16 | ```cypher 17 | (:AS {asn:2497})-[:RANK {rank:87}]-(:Ranking {name:'CAIDA ASRank'}) 18 | ``` 19 | 20 | Country: 21 | 22 | Connect AS to country nodes, meaning that the AS is registered in that country. 23 | 24 | ```cypher 25 | (:AS)-[:COUNTRY]-(:Country) 26 | ``` 27 | 28 | AS name: 29 | 30 | Connect AS to names nodes, providing the name of an AS. 31 | For example: 32 | 33 | ```cypher 34 | (:AS {asn:2497})-[:NAME]-(:Name {name:'IIJ'}) 35 | ``` 36 | 37 | ### Dependence 38 | 39 | The asrank crawler is not depending on other crawlers. 40 | 41 | ## IXPs (ixs.py) 42 | 43 | List of IXPs obtained from PeeringDB, Hurricane Electric, Packet Clearing House. 
44 | 45 | ### Graph representation 46 | 47 | Nodes: 48 | 49 | - `(:IXP {name})`: IXP node 50 | - `(:Name {name})`: Name of IXP 51 | - `(:PeeringLAN {prefix})`: Prefix of IXP peering LAN 52 | - `(:CaidaIXID {id})`: ID of the IXP assigned by CAIDA 53 | - `(:Country {country_code})`: Country code 54 | - `(:URL {url})`: Website of IXP 55 | 56 | Relationships: 57 | 58 | ```Cypher 59 | (:IXP)-[:COUNTRY]->(:Country) 60 | (:IXP)-[:EXTERNAL_ID]->(:CaidaIXID) 61 | (:IXP)-[:NAME]->(:Name) 62 | (:IXP)-[:WEBSITE]->(:URL) 63 | (:PeeringLAN)-[:MANAGED_BY]->(:IXP) 64 | ``` 65 | 66 | ### Dependence 67 | 68 | The ixs crawler depends on the peeringdb.ix crawler. 69 | 70 | ## IXP memberships (ix_asns.py) 71 | 72 | List of ASes present at each IXP. 73 | 74 | ### Graph representation 75 | 76 | Relationships: 77 | 78 | ```cypher 79 | (:AS)-[:MEMBER_OF]->(:IXP) 80 | ``` 81 | 82 | ### Dependence 83 | 84 | The ix_asns crawler depends on the ixs crawler. 85 | 86 | ## AS relationships (as_relationships_v[4|6].py) 87 | 88 | Inferred AS relationships (peer-to-peer or customer-provider). 89 | 90 | ### Graph representation 91 | 92 | ```cypher 93 | (:AS {asn: 2497})-[r:PEERS_WITH {af: 4, rel: -1}]->(:AS {asn: 7500}) 94 | ``` 95 | 96 | Either the `reference_name` or `af` properties can be used to distinguish between IPv4 97 | and IPv6. 98 | 99 | `rel: -1` indicates customer-provider, and the direction of the relationship is modeled 100 | as `provider -> customer` to be consistent with `bgpkit.as2rel`. 101 | 102 | `rel: 0` indicates a peer-to-peer relationship. 103 | 104 | **Note:** While both CAIDA and BGPKIT use `rel: 0` to indicate a peer-to-peer 105 | relationship, BGPKIT uses `rel: 1` for customer-provider, whereas CAIDA uses `rel: -1`. A sketch for normalizing the two encodings is given at the end of this README. 106 | 107 | ### Dependence 108 | 109 | The as_relationship crawler does not depend on other crawlers. 110 | 111 | ## AS Organizations (as2org.py) 112 | 113 | AS to organization mappings derived from the quarterly WHOIS dumps. 114 | 115 | ### Graph representation 116 | 117 | ```cypher 118 | (:AS {asn: 2497})-[:MANAGED_BY {org_id: '@aut-2497-JPNIC'}]->(:Organization {name: 'Internet Initiative Japan Inc.'}) 119 | (:Organization {name: 'Internet Initiative Japan Inc.'})-[:COUNTRY {org_ids: ['@aut-2497-JPNIC']}]->(:Country {country_code: 'JP'}) 120 | (:Organization {name: 'Internet Initiative Japan Inc.'})-[:NAME {org_ids: ['@aut-2497-JPNIC']}]->(:Name {name: 'Internet Initiative Japan Inc.'}) 121 | (:Organization {name: 'Internet Initiative Japan Inc.'})-[:EXTERNAL_ID]->(:CaidaOrgID {id: '@aut-2497-JPNIC'}) 122 | ``` 123 | 124 | We keep track of the org_id assigned by CAIDA in both the relationships and by adding a 125 | `CaidaOrgID` node. The main reason for this is that there are organizations with the 126 | same name but different IDs that may map to different countries. 127 | 128 | ### Dependence 129 | 130 | The as2org crawler does not depend on other crawlers.
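As mentioned in the note above, CAIDA and BGPKIT encode customer-provider links with different `rel` values (`-1` vs. `1`) while sharing the `provider -> customer` direction and `rel: 0` for peers. The following hedged sketch shows one way a consumer of the graph could normalize the two encodings; the `reference_name` prefixes are the crawler names used in this repository, and the function itself is illustrative rather than part of the code base.

```python
# Hedged sketch: normalize the 'rel' property of PEERS_WITH relationships
# created by the CAIDA and BGPKIT crawlers into one convention.
def normalized_rel(reference_name: str, rel: int) -> str:
    if rel == 0:
        # Both sources use 0 for peer-to-peer links.
        return 'peer'
    if reference_name.startswith('caida.') and rel == -1:
        return 'provider-customer'
    if reference_name.startswith('bgpkit.') and rel == 1:
        return 'provider-customer'
    raise ValueError(f'unexpected rel={rel} for {reference_name}')


assert normalized_rel('caida.as_relationships_v4', -1) == 'provider-customer'
assert normalized_rel('bgpkit.as2rel_v4', 1) == 'provider-customer'
```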
131 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/__init__.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import logging 3 | import os 4 | from datetime import datetime, timezone 5 | from io import BytesIO 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | from iyp import BaseCrawler 11 | 12 | 13 | class ASRelCrawler(BaseCrawler): 14 | def __init__(self, organization, url, name, af): 15 | super().__init__(organization, url, name) 16 | self.af = af 17 | self.reference['reference_url_info'] = \ 18 | 'https://publicdata.caida.org/datasets/as-relationships/serial-1/README.txt' 19 | 20 | def __get_latest_file(self): 21 | index = requests.get(self.reference['reference_url_data']) 22 | index.raise_for_status() 23 | soup = BeautifulSoup(index.text, features='html.parser') 24 | if self.af == 4: 25 | filename_template = '%Y%m%d.as-rel.txt.bz2' 26 | else: 27 | filename_template = '%Y%m%d.as-rel.v6-stable.txt.bz2' 28 | links = soup.find_all('a') 29 | file_dates = list() 30 | for link in links: 31 | try: 32 | dt = datetime.strptime(link['href'], filename_template).replace(tzinfo=timezone.utc) 33 | except ValueError: 34 | continue 35 | file_dates.append((dt, link['href'])) 36 | file_dates.sort() 37 | latest_file_date, latest_file_name = file_dates[-1] 38 | self.reference['reference_time_modification'] = latest_file_date 39 | self.reference['reference_url_data'] = os.path.join(self.reference['reference_url_data'], latest_file_name) 40 | logging.info(f'Fetching file: {self.reference["reference_url_data"]}') 41 | 42 | def run(self): 43 | self.__get_latest_file() 44 | req = requests.get(self.reference['reference_url_data']) 45 | req.raise_for_status() 46 | 47 | with bz2.open(BytesIO(req.content), 'rb') as f: 48 | text = f.read().decode() 49 | 50 | ases = set() 51 | peers_with_links = list() 52 | for line in text.splitlines(): 53 | if line.startswith('#'): 54 | continue 55 | left_asn, right_asn, kind = map(int, line.split('|')) 56 | ases.add(left_asn) 57 | ases.add(right_asn) 58 | peers_with_links.append({'src_id': left_asn, 'dst_id': right_asn, 59 | 'props': [self.reference, {'rel': kind, 'af': self.af}]}) 60 | 61 | as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases) 62 | 63 | for link in peers_with_links: 64 | link['src_id'] = as_id[link['src_id']] 65 | link['dst_id'] = as_id[link['dst_id']] 66 | 67 | self.iyp.batch_add_links('PEERS_WITH', peers_with_links) 68 | 69 | def unit_test(self): 70 | return super().unit_test(['PEERS_WITH']) 71 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/as_relationships_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.caida import ASRelCrawler 6 | 7 | URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' 8 | ORG = 'CAIDA' 9 | NAME = 'caida.as_relationships_v4' 10 | 11 | 12 | class Crawler(ASRelCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name, 4) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | 
datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/as_relationships_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.caida import ASRelCrawler 6 | 7 | URL = 'https://publicdata.caida.org/datasets/as-relationships/serial-1/' 8 | ORG = 'CAIDA' 9 | NAME = 'caida.as_relationships_v6' 10 | 11 | 12 | class Crawler(ASRelCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name, 6) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/caida/ix_asns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import sys 5 | from datetime import datetime, timezone 6 | 7 | import arrow 8 | import flatdict 9 | import requests 10 | 11 | from iyp import BaseCrawler 12 | 13 | URL = 'https://publicdata.caida.org/datasets/ixps/' 14 | ORG = 'CAIDA' 15 | NAME = 'caida.ix_asns' 16 | 17 | 18 | class Crawler(BaseCrawler): 19 | def __init__(self, organization, url, name): 20 | """Initialization: Find the latest file and set the URL""" 21 | 22 | date = arrow.now() 23 | 24 | for _ in range(6): 25 | full_url = url + f'ix-asns_{date.year}{date.month:02d}.jsonl' 26 | req = requests.head(full_url) 27 | 28 | # Found the latest file 29 | if req.status_code == 200: 30 | url = full_url 31 | break 32 | 33 | date = date.shift(months=-1) 34 | 35 | else: 36 | # for loop was not 'broken', no file available 37 | raise Exception('No recent CAIDA ix-asns file available') 38 | date = date.datetime.replace(day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc) 39 | 40 | logging.info(f'Fetching data from: {url}') 41 | super().__init__(organization, url, name) 42 | self.reference['reference_url_info'] = 'https://publicdata.caida.org/datasets/ixps/README.txt' 43 | self.reference['reference_time_modification'] = date 44 | 45 | def __set_modification_time_from_metadata_line(self, line): 46 | try: 47 | date_str = json.loads(line.lstrip('#'))['date'] 48 | date = datetime.strptime(date_str, '%Y.%m.%d %H:%M:%S').replace(tzinfo=timezone.utc) 49 | self.reference['reference_time_modification'] = date 50 | except (json.JSONDecodeError, KeyError, ValueError) as e: 51 | logging.warning(f'Failed to get modification date from metadata line: {line.strip()}') 52 | logging.warning(e) 53 | 
logging.warning('Using date from filename.') 54 | 55 | def run(self): 56 | """Fetch the latest file and process lines one by one.""" 57 | 58 | req = requests.get(self.url) 59 | req.raise_for_status() 60 | 61 | lines = [] 62 | asns = set() 63 | 64 | # Find all possible values and create corresponding nodes 65 | for line in req.text.splitlines(): 66 | if line.startswith('#'): 67 | self.__set_modification_time_from_metadata_line(line) 68 | continue 69 | 70 | ix = json.loads(line) 71 | lines.append(ix) 72 | asns.add(int(ix.get('asn'))) 73 | 74 | # get node IDs for ASNs, names, and countries 75 | ixp_id = self.iyp.batch_get_node_extid('CaidaIXID') 76 | as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 77 | 78 | # Compute links and add them to neo4j 79 | member_links = [] 80 | 81 | for mem in lines: 82 | ixp_qid = ixp_id.get(mem['ix_id']) 83 | asn_qid = as_id.get(mem['asn']) 84 | flat_mem = dict(flatdict.FlatDict(mem)) 85 | 86 | member_links.append({'src_id': asn_qid, 'dst_id': ixp_qid, 87 | 'props': [self.reference, flat_mem]}) 88 | # Push all links to IYP 89 | self.iyp.batch_add_links('MEMBER_OF', member_links) 90 | 91 | def unit_test(self): 92 | return super().unit_test(['MEMBER_OF']) 93 | 94 | 95 | def main() -> None: 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument('--unit-test', action='store_true') 98 | args = parser.parse_args() 99 | 100 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 101 | logging.basicConfig( 102 | format=FORMAT, 103 | filename='log/' + NAME + '.log', 104 | level=logging.INFO, 105 | datefmt='%Y-%m-%d %H:%M:%S' 106 | ) 107 | 108 | logging.info(f'Started: {sys.argv}') 109 | 110 | crawler = Crawler(ORG, URL, NAME) 111 | if args.unit_test: 112 | crawler.unit_test() 113 | else: 114 | crawler.run() 115 | crawler.close() 116 | logging.info(f'Finished: {sys.argv}') 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | sys.exit(0) 122 | -------------------------------------------------------------------------------- /iyp/crawlers/cisco/README.md: -------------------------------------------------------------------------------- 1 | # Cisco Umbrella -- https://umbrella-static.s3-us-west-1.amazonaws.com/index.html 2 | 3 | The popularity list contains most queried domains (ranging from TLDs to FQDNs) 4 | based on passive DNS usage across the Umbrella global network. 5 | 6 | IYP uses this data to create and annotate DomainName and HostName nodes. 7 | 8 | ## Graph representation 9 | 10 | The rank of the domain is indicated by the `rank` property of the relationship. 11 | 12 | ```Cypher 13 | (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) 14 | (:HostName {name: 'www.google.com'})-[:RANK {rank: 8}]->(:Ranking {name: 'Cisco Umbrella Top 1 million'}) 15 | ``` 16 | 17 | ## Dependence 18 | 19 | This crawler depends on `openintel.umbrella1m`. 20 | -------------------------------------------------------------------------------- /iyp/crawlers/citizenlab/README.md: -------------------------------------------------------------------------------- 1 | # Citizen Lab -- https://github.com/citizenlab/test-lists/blob/master/lists/ 2 | 3 | Citizen lab's test lists are URL testing lists intended to help in testing URL censorship, divided by country codes. In addition to these local lists, the global list consists of a wide range of internationally relevant and popular websites, including sites with content that is perceived to be provocative or objectionable. The dataset tha maps URL to Category. 
4 | 5 | ## Graph representation 6 | 7 | ### URL tags 8 | Connect URL to tag nodes meaning that an URL has been categorized according to the 9 | given tag. 10 | ``` 11 | (:URL {url: "https://www.flipkart.com/"})-[:CATEGORIZED]->(:Tag {label: 'COMM'}) 12 | ``` 13 | 14 | ## Dependence 15 | 16 | This crawler is not depending on other crawlers. 17 | -------------------------------------------------------------------------------- /iyp/crawlers/citizenlab/urldb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from urllib.error import HTTPError 5 | 6 | import pandas as pd 7 | 8 | from iyp import BaseCrawler, RequestStatusError 9 | 10 | ORG = 'Citizen Lab' 11 | URL = 'https://github.com/citizenlab/test-lists/blob/master/lists/' 12 | NAME = 'citizenlab.urldb' 13 | 14 | 15 | def generate_url(suffix): 16 | base_url = 'https://raw.githubusercontent.com/citizenlab/test-lists/master/lists/' 17 | joined_url = ''.join([base_url, suffix, '.csv']) 18 | return joined_url 19 | 20 | 21 | class Crawler(BaseCrawler): 22 | def __init__(self, organization, url, name): 23 | super().__init__(organization, url, name) 24 | self.reference['reference_url_info'] = 'https://github.com/citizenlab/test-lists' 25 | 26 | def run(self): 27 | # Fetch country codes to generate urls 28 | try: 29 | cc_df = pd.read_csv(generate_url('00-LEGEND-country_codes'), keep_default_na=False) 30 | except Exception as e: 31 | logging.error(f'Failed to fetch country codes: {e}') 32 | raise RequestStatusError('Error while fetching data file') 33 | 34 | country_codes = [e.lower() for e in cc_df['CountryCode']] 35 | 36 | # Iterate through country_codes, generate an url, download the csv file, extract 37 | # the necessary information from the csv file, and push the data to IYP. 38 | relationship_pairs = set() 39 | urls = set() 40 | categories = set() 41 | 42 | for code in country_codes: 43 | # Not all country codes have CSV files. 44 | try: 45 | df = pd.read_csv(generate_url(code)) 46 | except HTTPError as e: 47 | # 404 is expected, everything else is not. 
48 | if e.getcode() != 404: 49 | logging.warning(f'Request for country code "{code}" failed with error: {e}') 50 | raise e 51 | continue 52 | 53 | for row in df.itertuples(): 54 | url = row.url 55 | category = row.category_description 56 | urls.add(url) 57 | categories.add(category) 58 | relationship_pairs.add((url, category)) 59 | 60 | url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url', urls, all=False) 61 | category_id = self.iyp.batch_get_nodes_by_single_prop('Tag', 'label', categories, all=False) 62 | 63 | links = list() 64 | for (url, category) in relationship_pairs: 65 | url_qid = url_id[url] 66 | category_qid = category_id[category] 67 | links.append({'src_id': url_qid, 'dst_id': category_qid, 'props': [self.reference]}) 68 | 69 | # Push all links to IYP 70 | self.iyp.batch_add_links('CATEGORIZED', links) 71 | 72 | def unit_test(self): 73 | return super().unit_test(['CATEGORIZED']) 74 | 75 | 76 | def main() -> None: 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--unit-test', action='store_true') 79 | args = parser.parse_args() 80 | 81 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 82 | logging.basicConfig( 83 | format=FORMAT, 84 | filename='log/' + NAME + '.log', 85 | level=logging.INFO, 86 | datefmt='%Y-%m-%d %H:%M:%S' 87 | ) 88 | 89 | logging.info(f'Started: {sys.argv}') 90 | 91 | crawler = Crawler(ORG, URL, NAME) 92 | if args.unit_test: 93 | crawler.unit_test() 94 | else: 95 | crawler.run() 96 | crawler.close() 97 | logging.info(f'Finished: {sys.argv}') 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | sys.exit(0) 103 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/README.md: -------------------------------------------------------------------------------- 1 | # Cloudflare Radar -- https://radar.cloudflare.com/ 2 | 3 | Cloudflare uses aggregated and anonymized DNS queries to their `1.1.1.1` public resolver service to 4 | provide various datasets, including: 5 | 6 | - [Ordered top 100 domain 7 | names](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): The 100 8 | domains with the most DNS queries, including assigned ranks. 9 | - [Unordered top 1,000 to 1,000,000 10 | domains](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): Same as 11 | above, but there are no ranks assigned. Fetched set sizes are 1,000, 2,000, 5,000, 10,000, 20,000, 12 | 50,000, 100,000, 200,000, 500,000, and 1,000,000. 13 | - [Top 100 countries querying each of the 10,000 highest ranked domain 14 | names](https://developers.cloudflare.com/radar/investigate/dns/#top-locations): For each domain 15 | that is in the top 10,000 of *any* ranking included in IYP, fetch the top 100 countries with the 16 | most DNS queries. 17 | - [Top 100 ASes querying each of the 10,000 highest ranked domain 18 | names](https://developers.cloudflare.com/api/operations/radar-get-dns-top-ases): Same as above, but 19 | fetch AS numbers instead. 20 | 21 | All rankings are based on one week of data. 22 | Cloudflare radar's top location and ASes is available for both domain names 23 | and host names. Results are likely accounting for all NS, A, AAAA queries made to 24 | Cloudflare's resolver. Since NS queries for host names make no sense IYP links these 25 | results to `DomainName` nodes. 
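All of these datasets are fetched from the Cloudflare Radar API, which requires an application key (see the Notes section at the end of this README). As a rough sketch of the access pattern (key location, endpoint, and response layout mirror `top100.py` further below; treat the details as illustrative):

```python
# Hedged sketch: authenticated request to the Radar ranking endpoint.
# Key location and endpoint follow top100.py in this directory.
import json

import requests

API_KEY = json.load(open('config.json', 'r'))['cloudflare']['apikey']
URL = 'https://api.cloudflare.com/client/v4/radar/ranking/top?name=top&limit=100&format=json'

headers = {
    'Authorization': 'Bearer ' + API_KEY,
    'Content-Type': 'application/json',
}
req = requests.get(URL, headers=headers)
req.raise_for_status()

# The ordered top-100 list is under result -> top.
for entry in req.json()['result']['top']:
    print(entry['rank'], entry['domain'])
```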
26 | 27 | ## Graph representation 28 | 29 | ### Ordered top 100 domain names - `top100.py` 30 | 31 | Connect DomainName nodes to a single Ranking node corresponding to the ordered Cloudflare top 100 32 | ranking. The `rank` is given as a relationship property. 33 | 34 | ```Cypher 35 | (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Cloudflare top 100 domains'}) 36 | ``` 37 | 38 | ### Unordered top *n* domain names - `ranking_bucket.py` 39 | 40 | Connect DomainName nodes to a single Ranking node corresponding to the unordered Cloudflare top *n* 41 | ranking. There is no rank assigned to the domain name, but *n* is specified in the `top` property of 42 | the Ranking node. 43 | 44 | ```Cypher 45 | (:DomainName {name: 'google.com'})-[:RANK]->(:Ranking {name: 'Cloudflare Top 1000 ranking domains', top: 1000}) 46 | ``` 47 | 48 | ### Top countries - `dns_top_locations.py` 49 | 50 | Connect each DomainName node to up to 100 Country nodes representing the countries from which the 51 | domain was queried from the most. The `value` property of the QUERIED_FROM relationship describes 52 | the percentage of all queries (within one week) originating from the country. 53 | 54 | ```Cypher 55 | (:DomainName {name: 'google.com'})-[:QUERIED_FROM {value: 37.05}]->(:Country {country_code: 'US'}) 56 | ``` 57 | 58 | ### Top ASes - `dns_top_ases.py` 59 | 60 | Connect each DomainName node to up to 100 AS nodes from which the domain was queried from the most. 61 | The `value` property of the QUERIED_FROM relationship describes the percentage of all queries 62 | (within one week) originating from the AS. 63 | 64 | ```Cypher 65 | (:DomainName {name: 'google.com'}-[:QUERIED_FROM {value: 3.51}]->(:AS {asn: 714})) 66 | ``` 67 | 68 | ## Dependence 69 | 70 | The `dns_top_locations` and `dns_top_ases` crawlers should be run after all crawlers that produce 71 | `(:DomainName)-[:RANK {rank: n}]->(:Ranking)` relationships: 72 | 73 | - `cloudflare.top100` 74 | - `tranco.top1m` 75 | 76 | ## Notes 77 | 78 | This crawler requires an application key to access the radar's API. 79 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/dns_top_ases.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import flatdict 6 | 7 | from iyp.crawlers.cloudflare import DnsTopCrawler 8 | 9 | ORG = 'Cloudflare' 10 | URL = 'https://api.cloudflare.com/client/v4/radar/dns/top/ases/' 11 | NAME = 'cloudflare.dns_top_ases' 12 | 13 | 14 | class Crawler(DnsTopCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name) 17 | 18 | self.reference['reference_url_info'] = 'https://developers.cloudflare.com/api/operations/radar-get-dns-top-ases' 19 | 20 | def compute_link(self, param): 21 | 22 | name, ases = param 23 | 24 | # 'meta' result it not a domain, but contains metadata so skip. 
25 | if name == 'meta': 26 | return 27 | 28 | qids = list() 29 | if name in self.domain_names_id: 30 | qids.append(self.domain_names_id[name]) 31 | if name in self.host_names_id: 32 | qids.append(self.host_names_id[name]) 33 | 34 | for entry in ases: 35 | if not entry: 36 | continue 37 | 38 | asn = entry['clientASN'] 39 | self.to_nodes.add(asn) 40 | 41 | entry['value'] = float(entry['value']) 42 | 43 | flat_prop = dict(flatdict.FlatDict(entry)) 44 | for qid in qids: 45 | self.links.append({ 46 | 'src_id': qid, 47 | 'dst_id': asn, 48 | 'props': [flat_prop, self.reference] 49 | }) 50 | 51 | def map_links(self): 52 | as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', self.to_nodes, all=False) 53 | for link in self.links: 54 | link['dst_id'] = as_id[link['dst_id']] 55 | 56 | 57 | def main() -> None: 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument('--unit-test', action='store_true') 60 | args = parser.parse_args() 61 | 62 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 63 | logging.basicConfig( 64 | format=FORMAT, 65 | filename='log/' + NAME + '.log', 66 | level=logging.INFO, 67 | datefmt='%Y-%m-%d %H:%M:%S' 68 | ) 69 | 70 | logging.info(f'Started: {sys.argv}') 71 | 72 | crawler = Crawler(ORG, URL, NAME) 73 | if args.unit_test: 74 | crawler.unit_test() 75 | else: 76 | crawler.run() 77 | crawler.close() 78 | logging.info(f'Finished: {sys.argv}') 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | sys.exit(0) 84 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/dns_top_locations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import flatdict 6 | 7 | from iyp.crawlers.cloudflare import DnsTopCrawler 8 | 9 | ORG = 'Cloudflare' 10 | URL = 'https://api.cloudflare.com/client/v4/radar/dns/top/locations/' 11 | NAME = 'cloudflare.dns_top_locations' 12 | 13 | 14 | class Crawler(DnsTopCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name) 17 | 18 | self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/dns/#top-locations' 19 | 20 | def compute_link(self, param): 21 | 22 | domain, countries = param 23 | 24 | # 'meta' result it not a domain, but contains metadata so skip. 
25 | if domain == 'meta': 26 | return 27 | 28 | domain_qid = self.domain_names_id[domain] 29 | 30 | for entry in countries: 31 | if not entry: 32 | continue 33 | 34 | cc = entry['clientCountryAlpha2'] 35 | self.to_nodes.add(cc) 36 | 37 | entry['value'] = float(entry['value']) 38 | 39 | flat_prop = dict(flatdict.FlatDict(entry)) 40 | self.links.append({ 41 | 'src_id': domain_qid, 42 | 'dst_id': cc, 43 | 'props': [flat_prop, self.reference] 44 | }) 45 | 46 | def map_links(self): 47 | cc_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', self.to_nodes, all=False) 48 | for link in self.links: 49 | link['dst_id'] = cc_id[link['dst_id']] 50 | 51 | 52 | def main() -> None: 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--unit-test', action='store_true') 55 | args = parser.parse_args() 56 | 57 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 58 | logging.basicConfig( 59 | format=FORMAT, 60 | filename='log/' + NAME + '.log', 61 | level=logging.INFO, 62 | datefmt='%Y-%m-%d %H:%M:%S' 63 | ) 64 | 65 | logging.info(f'Started: {sys.argv}') 66 | 67 | crawler = Crawler(ORG, URL, NAME) 68 | if args.unit_test: 69 | crawler.unit_test() 70 | else: 71 | crawler.run() 72 | crawler.close() 73 | logging.info(f'Finished: {sys.argv}') 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | sys.exit(0) 79 | -------------------------------------------------------------------------------- /iyp/crawlers/cloudflare/top100.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import sys 6 | from datetime import datetime, timezone 7 | 8 | import requests 9 | 10 | from iyp import BaseCrawler 11 | 12 | # Organization name and URL to data 13 | ORG = 'Cloudflare' 14 | URL = 'https://api.cloudflare.com/client/v4/radar/ranking/top?name=top&limit=100&format=json' 15 | NAME = 'cloudflare.top100' 16 | 17 | API_KEY = '' 18 | if os.path.exists('config.json'): 19 | API_KEY = json.load(open('config.json', 'r'))['cloudflare']['apikey'] 20 | 21 | 22 | class Crawler(BaseCrawler): 23 | # Base Crawler provides access to IYP via self.iyp 24 | # and setup a dictionary with the org/url/today's date in self.reference 25 | # 26 | # Cloudflare ranks second and third level domain names (not host names). 
27 | # See https://blog.cloudflare.com/radar-domain-rankings/ 28 | def __init__(self, organization, url, name): 29 | super().__init__(organization, url, name) 30 | self.reference['reference_url_info'] = 'https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/' # noqa: E501 31 | 32 | def run(self): 33 | """Fetch data and push to IYP.""" 34 | 35 | self.cf_qid = self.iyp.get_node( 36 | 'Ranking', {'name': 'Cloudflare top 100 domains'}) 37 | 38 | # Fetch data 39 | headers = { 40 | 'Authorization': 'Bearer ' + API_KEY, 41 | 'Content-Type': 'application/json' 42 | } 43 | 44 | req = requests.get(self.reference['reference_url_data'], headers=headers) 45 | req.raise_for_status() 46 | 47 | results = req.json()['result'] 48 | 49 | try: 50 | date_str = results['meta']['dateRange'][0]['endTime'] 51 | date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) 52 | self.reference['reference_time_modification'] = date 53 | except (KeyError, ValueError, TypeError) as e: 54 | logging.warning(f'Failed to get modification time: {e}') 55 | 56 | # Process line one after the other 57 | processed = list(map(self.update, results['top'])) 58 | logging.info(f'Processed {len(processed)} lines') 59 | 60 | def update(self, entry): 61 | """Add the entry to IYP if it's not already there and update its properties.""" 62 | 63 | # set rank 64 | statements = [['RANK', self.cf_qid, dict({'rank': entry['rank']}, **self.reference)]] 65 | 66 | # Commit to IYP 67 | # Get the AS's node ID (create if it is not yet registered) and commit changes 68 | domain_qid = self.iyp.get_node('DomainName', {'name': entry['domain']}) 69 | self.iyp.add_links(domain_qid, statements) 70 | 71 | def unit_test(self): 72 | return super().unit_test(['RANK']) 73 | 74 | 75 | def main() -> None: 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--unit-test', action='store_true') 78 | args = parser.parse_args() 79 | 80 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 81 | logging.basicConfig( 82 | format=FORMAT, 83 | filename='log/' + NAME + '.log', 84 | level=logging.INFO, 85 | datefmt='%Y-%m-%d %H:%M:%S' 86 | ) 87 | 88 | logging.info(f'Started: {sys.argv}') 89 | 90 | crawler = Crawler(ORG, URL, NAME) 91 | if args.unit_test: 92 | crawler.unit_test() 93 | else: 94 | crawler.run() 95 | crawler.close() 96 | logging.info(f'Finished: {sys.argv}') 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | sys.exit(0) 102 | -------------------------------------------------------------------------------- /iyp/crawlers/emileaben/README.md: -------------------------------------------------------------------------------- 1 | # EmileAben's AS Names -- https://github.com/emileaben/asnames 2 | 3 | Data collected by EmileAben's AS Names, including: 4 | 5 | - AS numbers 6 | - AS names 7 | 8 | ## Graph representation 9 | 10 | ### AS names 11 | 12 | Connect AS to names nodes, providing the name of an AS. 13 | For example: 14 | 15 | ```Cypher 16 | (:AS {asn: 2497})-[:NAME]-(:Name {name: 'IIJ'}) 17 | ``` 18 | 19 | ## Dependence 20 | 21 | This crawler is not depending on other crawlers. 
22 | -------------------------------------------------------------------------------- /iyp/crawlers/emileaben/as_names.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import sys 5 | import tempfile 6 | 7 | import requests 8 | 9 | from iyp import BaseCrawler, get_commit_datetime 10 | 11 | # Organization name and URL to data 12 | ORG = 'emileaben' 13 | URL = 'https://raw.githubusercontent.com/emileaben/asnames/main/asnames.csv' 14 | NAME = 'emileaben.as_names' # should reflect the directory and name of this file 15 | 16 | 17 | class Crawler(BaseCrawler): 18 | # Base Crawler provides access to IYP via self.iyp 19 | # and setup a dictionary with the org/url/today's date in self.reference 20 | def __init__(self, organization, url, name): 21 | super().__init__(organization, url, name) 22 | self.reference['reference_url_info'] = 'https://github.com/emileaben/asnames' 23 | self.reference['reference_time_modification'] = get_commit_datetime('emileaben/asnames', 'asnames.csv') 24 | 25 | def run(self): 26 | # Create a temporary directory 27 | tmpdir = tempfile.mkdtemp() 28 | filename = os.path.join(tmpdir, 'as_names.txt') 29 | 30 | res = requests.get(URL) 31 | res.raise_for_status() 32 | 33 | with open(filename, 'w') as file: 34 | file.write(res.text) 35 | 36 | lines = [] 37 | asns = set() 38 | as_names = set() 39 | 40 | with open(filename, 'r') as file: 41 | for line in file: 42 | line = line.strip() 43 | values = line.split('|') 44 | as_number = values[0] 45 | as_name = values[2] 46 | asns.add(int(as_number)) 47 | as_names.add(as_name) 48 | lines.append(values) 49 | 50 | asns_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 51 | as_names_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', as_names, all=False) 52 | 53 | links = [] 54 | 55 | for line in lines: 56 | asn_qid = asns_id[int(line[0])] 57 | as_name_qid = as_names_id[line[2]] 58 | links.append( 59 | {'src_id': asn_qid, 'dst_id': as_name_qid, 'props': [self.reference, {'contributor': line[1]}]}) 60 | 61 | # Push all links to IYP 62 | self.iyp.batch_add_links('NAME', links) 63 | 64 | def unit_test(self): 65 | return super().unit_test(['NAME']) 66 | 67 | 68 | def main() -> None: 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--unit-test', action='store_true') 71 | args = parser.parse_args() 72 | 73 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 74 | logging.basicConfig( 75 | format=FORMAT, 76 | filename='log/' + NAME + '.log', 77 | level=logging.INFO, 78 | datefmt='%Y-%m-%d %H:%M:%S' 79 | ) 80 | 81 | logging.info(f'Started: {sys.argv}') 82 | 83 | crawler = Crawler(ORG, URL, NAME) 84 | if args.unit_test: 85 | crawler.unit_test() 86 | else: 87 | crawler.run() 88 | crawler.close() 89 | logging.info(f'Finished: {sys.argv}') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | sys.exit(0) 95 | -------------------------------------------------------------------------------- /iyp/crawlers/example/README.md: -------------------------------------------------------------------------------- 1 | # Example Crawler 2 | 3 | This is an example of crawler where nodes/relationships are pushed in batches. It is not 4 | a working example but can be used as a template. For a simple working example see 5 | [here](../bgpkit/pfx2asn.py). 6 | 7 | While there are methods to get/create individual nodes they should only be used in rare 8 | cases, as batch creation is almost always faster. 
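To make the batch pattern concrete, here is a hedged sketch of the typical shape of a crawler's `run()` method. The method names (`batch_get_nodes_by_single_prop`, `batch_add_links`) are taken from the crawlers in this repository; the input data and the `EXAMPLE_*` labels are placeholders.

```python
# Hedged sketch of the batch pattern used throughout this repository.
def run(self):
    # Placeholder input; a real crawler would fetch and parse its data source.
    records = [{'asn': 2497, 'name': 'example'}]

    # 1. Collect all property values and create/fetch the nodes in one batch.
    asns = {r['asn'] for r in records}
    names = {r['name'] for r in records}
    asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
    name_id = self.iyp.batch_get_nodes_by_single_prop('EXAMPLE_NODE_LABEL', 'name', names)

    # 2. Build the relationship list, referencing nodes by their IDs.
    links = [{'src_id': asn_id[r['asn']],
              'dst_id': name_id[r['name']],
              'props': [self.reference]} for r in records]

    # 3. Push all relationships in one batch.
    self.iyp.batch_add_links('EXAMPLE_RELATIONSHIP_LABEL', links)
```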
9 | 10 | The first paragraph of this readme should be a description of the dataset and give an 11 | overview of the parts we push to IYP. 12 | 13 | ## Graph representation 14 | 15 | Connect AS nodes to EXAMPLE_NODE_LABEL with EXAMPLE_RELATIONSHIP_LABEL relationship. 16 | 17 | ```cypher 18 | (:AS)-[:EXAMPLE_RELATIONSHIP_LABEL]-(:EXAMPLE_NODE_LABEL) 19 | ``` 20 | 21 | ## Dependence 22 | 23 | This crawler is not depending on other crawlers. 24 | -------------------------------------------------------------------------------- /iyp/crawlers/google/README.md: -------------------------------------------------------------------------------- 1 | # Google CrUX -- https://developer.chrome.com/docs/crux 2 | 3 | The Chrome User Experience Report (CrUX for short) is a dataset collected by 4 | Google that reflects how real-world Chrome users experience popular 5 | destinations on the web. 6 | 7 | CrUX data is collected from real browsers around the world, based on certain 8 | browser options which determine user eligibility. A set of dimensions and metrics 9 | are collected which allow site owners to determine how users experience their sites. 10 | 11 | IYP fetches CrUX's [top 1M popular websites per country](https://github.com/InternetHealthReport/crux-top-lists-country). 12 | Unlike others, CrUX rankings are bucketed by rank magnitude order, not by 13 | specific rank. For example, ranks are 1000, 10k, 100k, or 1M. 14 | 15 | In addition, CrUX ranks *origins* (e.g., https://www.google.com), not domain 16 | or host names. In IYP we extract the hostname part of the origin and model this 17 | dataset using the hostname. 18 | 19 | ## Graph representation 20 | 21 | ```cypher 22 | (:HostName {name:'www.iij.ad.jp'})-[:RANK {rank: 50000, origin:'https://www.iij.ad.jp'}]-(r:Ranking {name:'CrUX top 1M (JP)'})-[:COUNTRY]-(:Country {country_code:'JP'}) 23 | ``` 24 | 25 | The `RANK` relationship contains the property `origin` to recover the origin 26 | given in the original dataset. 27 | 28 | ## Dependence 29 | 30 | This crawler is not depending on other crawlers. 31 | -------------------------------------------------------------------------------- /iyp/crawlers/iana/README.md: -------------------------------------------------------------------------------- 1 | # IANA -- https://www.iana.org/ 2 | 3 | The Internet Assigned Numbers Authority (IANA) is responsible for the global 4 | coordination of the DNS Root, IP addressing, and other Internet protocol resources. 5 | 6 | Datasets used by IYP: 7 | 8 | - DNS [root zone file](https://www.iana.org/domains/root/files) to retrieve information 9 | about authoritative name servers of the top-level domains as well as their IP 10 | addresses. 11 | 12 | ## Graph representation 13 | 14 | ### Root zone file - `root_zone.py` 15 | 16 | IYP imports `NS`, `A`, and `AAAA` records from the root zone file. 
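As a hedged illustration of how such records could be turned into the relationships listed below (assuming the standard whitespace-separated zone file format; the actual `root_zone.py` crawler may parse the file differently):

```python
# Hedged sketch: extract NS/A/AAAA records from root zone file lines.
def parse_root_zone(lines):
    managed_by, resolves_to = [], []
    for line in lines:
        fields = line.split()
        if len(fields) < 5 or fields[2] != 'IN':
            continue
        name, rtype, rdata = fields[0].rstrip('.'), fields[3], fields[4]
        if rtype == 'NS':
            managed_by.append((name, rdata.rstrip('.')))   # (:DomainName)-[:MANAGED_BY]->(...)
        elif rtype in ('A', 'AAAA'):
            resolves_to.append((name, rdata))              # (...)-[:RESOLVES_TO]->(:IP)
    return managed_by, resolves_to


zone = ['jp.\t172800\tIN\tNS\ta.dns.jp.',
        'a.dns.jp.\t172800\tIN\tA\t203.119.1.1']
print(parse_root_zone(zone))
# ([('jp', 'a.dns.jp')], [('a.dns.jp', '203.119.1.1')])
```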
17 | 18 | ```Cypher 19 | // NS record 20 | (:DomainName {name: 'jp'})-[:MANAGED_BY]->(:DomainName:AuthoritativeNameServer {name: 'a.dns.jp'}) 21 | // A record 22 | (:DomainName:AuthoritativeNameServer {name: 'a.dns.jp'})-[:RESOLVES_TO]->(:IP {ip: '203.119.1.1'}) 23 | // AAAA record 24 | (:DomainName:AuthoritativeNameServer {name: 'a.dns.jp'})-[:RESOLVES_TO]->(:IP {ip: '2001:dc4::1'}) 25 | ``` 26 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/README.md: -------------------------------------------------------------------------------- 1 | # Internet Health Report -- https://ihr.iijlab.net/ 2 | 3 | Data inferred by IIJ's Internet Health Report, including: 4 | 5 | - AS inter-dependency 6 | - country's AS dependency 7 | - prefixes' RPKI and IRR status 8 | 9 | #### Country's AS dependency 10 | The way to retrieve the country AS dependency values shown on IHR website (e.g. https://ihr.iijlab.net/ihr/en-us/countries/JP) is as follow. 11 | For eyeball ranking nodes get `hege` and `weight` values from the corresponding RANK relationship and then: 12 | - Population Total = 100*`hege` 13 | - Population Direct = `weight` 14 | - Population Indirect = 100*`hege`-`weight` 15 | 16 | For AS ranking nodes get `hege` values from the corresponding RANK relationship and then: 17 | - AS Total = 100*`hege` 18 | 19 | The values are not exactly the same as the ones shown on the IHR website because the IHR website averages results over three days. 20 | 21 | 22 | 23 | ## Graph representation 24 | 25 | ### AS inter-dependency - `local_hegemony.py` 26 | 27 | Connect ASes that are depending on each other, meaning that an AS is commonly found on routes 28 | towards an origin AS. The strength of the dependence is given by the `hege` link attribute (AS 29 | Hegemony value) which range from 0 to 1. Strongest dependencies being equal to 1. 30 | 31 | ```Cypher 32 | (:AS {asn: 2907})-[:DEPENDS_ON {hege: 0.82}]-(:AS {asn: 2497}) 33 | ``` 34 | 35 | ### Country's AS dependency - `country_dependency.py` 36 | 37 | Connect ASes to ranking nodes which are also connected to a country. A Country AS dependency is 38 | computed in two different ways, emphasizing either the distribution of the country's population 39 | (a.k.a. Total eyeball) or the country ASes (a.k.a. Total AS), for example: 40 | 41 | ```Cypher 42 | (:AS {asn: 2497})-[:RANK {rank: 1, hege: 0.19}]-> 43 | (:Ranking {name: 'IHR country ranking: Total AS (JP)'})-[:COUNTRY]-> 44 | (:Country {country_code: 'JP'}) 45 | ``` 46 | 47 | means that Japan ASes depends strongly (AS Hegemony equals 0.19) on AS2497. 48 | 49 | ### Prefixes' RPKI and IRR status - `rov.py` 50 | 51 | Connect prefixes to their origin AS, their AS dependencies, their RPKI/IRR status, and their country 52 | (provided by Maxmind). 53 | 54 | ```Cypher 55 | (:BGPPrefix {prefix: '8.8.8.0/24'})<-[:ORIGINATE]-(:AS {asn: 15169}) 56 | (:BGPPrefix {prefix: '8.8.8.0/24'})-[:DEPENDS_ON]->(:AS {asn: 15169}) 57 | (:BGPPrefix {prefix: '8.8.8.0/24'})-[:CATEGORIZED]->(:Tag {label: 'RPKI Valid'}) 58 | (:BGPPrefix {prefix: '8.8.8.0/24'})-[:COUNTRY]->(:Country {country_code: 'US'}) 59 | ``` 60 | 61 | Tag labels (possibly) added by this crawler: 62 | 63 | - `RPKI Valid` 64 | - `RPKI Invalid` 65 | - `RPKI Invalid,more-specific` 66 | - `RPKI NotFound` 67 | - `IRR Valid` 68 | - `IRR Invalid` 69 | - `IRR Invalid,more-specific` 70 | - `IRR NotFound` 71 | 72 | The country geo-location is provided by Maxmind. 
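For illustration, a query of the following form (a sketch combining the relationships above, not part of the crawler) lists RPKI-invalid prefixes together with their origin AS and country:

```Cypher
MATCH (p:BGPPrefix)-[:CATEGORIZED]->(:Tag {label: 'RPKI Invalid'}),
      (p)<-[:ORIGINATE]-(a:AS),
      (p)-[:COUNTRY]->(c:Country)
RETURN p.prefix, a.asn, c.country_code
LIMIT 10
```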
73 | 74 | ## Dependence 75 | 76 | These crawlers are not depending on other crawlers. 77 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/__init__.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import logging 4 | from datetime import datetime, timedelta, timezone 5 | 6 | import lz4.frame 7 | import requests 8 | 9 | from iyp import BaseCrawler, DataNotAvailableError 10 | 11 | 12 | class HegemonyCrawler(BaseCrawler): 13 | def __init__(self, organization, url, name, af): 14 | self.af = af 15 | super().__init__(organization, url, name) 16 | self.reference['reference_url_info'] = 'https://ihr.iijlab.net/ihr/en-us/documentation#AS_dependency' 17 | 18 | def run(self): 19 | """Fetch data from file and push to IYP.""" 20 | 21 | today = datetime.now(tz=timezone.utc) 22 | max_lookback = today - timedelta(days=7) 23 | url = today.strftime(self.url) 24 | req = requests.head(url) 25 | while req.status_code != 200 and today > max_lookback: 26 | today -= timedelta(days=1) 27 | url = today.strftime(self.url) 28 | req = requests.head(url) 29 | if req.status_code != 200: 30 | logging.error('Failed to find data within the specified lookback interval.') 31 | raise DataNotAvailableError('Failed to find data within the specified lookback interval.') 32 | 33 | self.reference['reference_url_data'] = url 34 | 35 | logging.info(f'Fetching data from: {url}') 36 | req = requests.get(url) 37 | req.raise_for_status() 38 | 39 | # lz4.frame.decompress() and splitlines() break the CSV parsing due to some 40 | # weird input. 41 | with lz4.frame.open(io.BytesIO(req.content)) as f: 42 | csv_lines = [l.decode('utf-8').rstrip() for l in f] 43 | 44 | timebin = None 45 | asns = set() 46 | links = list() 47 | 48 | logging.info('Computing links...') 49 | for rec in csv.DictReader(csv_lines): 50 | # header 51 | # timebin,originasn,asn,hege 52 | 53 | rec['hege'] = float(rec['hege']) 54 | rec['af'] = self.af 55 | 56 | if timebin is None: 57 | timebin = rec['timebin'] 58 | mod_time = datetime.strptime(timebin, '%Y-%m-%d %H:%M:%S+00').replace(tzinfo=timezone.utc) 59 | self.reference['reference_time_modification'] = mod_time 60 | elif timebin != rec['timebin']: 61 | break 62 | 63 | originasn = int(rec['originasn']) 64 | asn = int(rec['asn']) 65 | asns.add(originasn) 66 | asns.add(asn) 67 | 68 | links.append({ 69 | 'src_id': originasn, 70 | 'dst_id': asn, 71 | 'props': [self.reference, rec] 72 | }) 73 | 74 | asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns, all=False) 75 | # Replace values in links with node IDs. 
76 | for link in links: 77 | link['src_id'] = asn_id[link['src_id']] 78 | link['dst_id'] = asn_id[link['dst_id']] 79 | 80 | # Push links to IYP 81 | self.iyp.batch_add_links('DEPENDS_ON', links) 82 | 83 | def unit_test(self): 84 | return super().unit_test(['DEPENDS_ON']) 85 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/local_hegemony_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.ihr import HegemonyCrawler 6 | 7 | # URL to the API 8 | URL = 'https://archive.ihr.live/ihr/hegemony/ipv4/local/%Y/%m/%d/ihr_hegemony_ipv4_local_%Y-%m-%d.csv.lz4' 9 | ORG = 'IHR' 10 | NAME = 'ihr.local_hegemony_v4' 11 | 12 | 13 | class Crawler(HegemonyCrawler): 14 | def __init__(self, organization, url, name): 15 | super().__init__(organization, url, name, af=4) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/ihr/local_hegemony_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.ihr import HegemonyCrawler 6 | 7 | # URL to the API 8 | URL = 'https://archive.ihr.live/ihr/hegemony/ipv6/local/%Y/%m/%d/ihr_hegemony_ipv6_local_%Y-%m-%d.csv.lz4' 9 | ORG = 'IHR' 10 | NAME = 'ihr.local_hegemony_v6' 11 | 12 | 13 | class Crawler(HegemonyCrawler): 14 | def __init__(self, organization, url, name): 15 | super().__init__(organization, url, name, af=6) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/inetintel/README.md: -------------------------------------------------------------------------------- 1 | # Internet Intelligence Lab - Dataset: AS to Organization mapping -- https://github.com/InetIntel/Dataset-AS-to-Organization-Mapping 2 | 3 | The dataset contains historical and current versions of the AS to Organization 4 | mapping datasets. A mapping will be created between AS to its sibling ASes. 
5 | 6 | ## Graph representation 7 | 8 | ### Sibling ASes 9 | Connect ASes that are managed by the same organization. 10 | ```cypher 11 | (a:AS {asn: 2497})-[:SIBLING_OF]->(b:AS) 12 | ``` 13 | 14 | ### Sibling organizations 15 | ```cypher 16 | (a:Organization {name: 'NTT Communications Corporation'})-[:SIBLING_OF]->(b:Organization {name: 'NTT Communications (N-BONE)'}) 17 | ``` 18 | 19 | ## Dependence 20 | 21 | This crawler assumes PeeringDB organizations are already present. 22 | -------------------------------------------------------------------------------- /iyp/crawlers/ipinfo/README.md: -------------------------------------------------------------------------------- 1 | # IPinfo -- https://ipinfo.io/ 2 | 3 | IPinfo is an IP geolocation service, that provides different kinds of IP databases, 4 | including a [free tier](https://ipinfo.io/products/free-ip-database) that maps IP ranges 5 | to countries. We import the free database into IYP. 6 | 7 | ## Graph representation 8 | 9 | Since the IP ranges are not necessarily CIDR aligned, we decompose unaligned ranges 10 | into their CIDR-equivalent blocks. The original range is retained in the `start_ip` and 11 | `end_ip` properties of the `COUNTRY` relationship. 12 | 13 | A range can also be just a single IP, resulting in /32 or /128 prefixes, which is 14 | intended. 15 | 16 | ```cypher 17 | (:GeoPrefix {prefix: '203.180.224.0/19'})-[:COUNTRY {start_ip: '203.180.204.28', end_ip: '203.181.102.41'}]->(:Country {country_code: 'JP'}) 18 | ``` 19 | 20 | ## Dependence 21 | 22 | This crawler is not depending on other crawlers. 23 | -------------------------------------------------------------------------------- /iyp/crawlers/ipinfo/ip_country.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | import sys 7 | from ipaddress import ip_address, summarize_address_range 8 | 9 | import requests 10 | 11 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 12 | 13 | ORG = 'IPinfo' 14 | URL = 'https://ipinfo.io/data/free/country.json.gz' 15 | NAME = 'ipinfo.ip_country' 16 | 17 | IPINFO_TOKEN = '' 18 | if os.path.exists('config.json'): 19 | IPINFO_TOKEN = json.load(open('config.json', 'r'))['ipinfo']['token'] 20 | 21 | 22 | class Crawler(BaseCrawler): 23 | def __init__(self, organization, url, name): 24 | super().__init__(organization, url, name) 25 | self.reference['reference_url_info'] = 'https://ipinfo.io/products/free-ip-database' 26 | 27 | def run(self): 28 | """Fetch data and push to IYP.""" 29 | 30 | headers = {'Authorization': f'Bearer {IPINFO_TOKEN}'} 31 | req = requests.get(self.reference['reference_url_data'], headers=headers) 32 | req.raise_for_status() 33 | 34 | set_modification_time_from_last_modified_header(self.reference, req) 35 | rows = gzip.decompress(req.content) 36 | 37 | countries = set() 38 | prefixes = set() 39 | links = list() 40 | 41 | for row in rows.splitlines(): 42 | doc = json.loads(row) 43 | start, end = ip_address(doc['start_ip']), ip_address(doc['end_ip']) 44 | for prefix in summarize_address_range(start, end): 45 | country_code = doc['country'] 46 | prefix = prefix.compressed 47 | countries.add(country_code) 48 | prefixes.add(prefix) 49 | links.append({'src_id': prefix, 'dst_id': country_code, 'props': [self.reference, doc]}) 50 | 51 | country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries, all=False) 52 | prefix_id = 
self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes, all=False) 53 | self.iyp.batch_add_node_label(list(prefix_id.values()), 'GeoPrefix') 54 | 55 | for link in links: 56 | link['src_id'] = prefix_id[link['src_id']] 57 | link['dst_id'] = country_id[link['dst_id']] 58 | 59 | self.iyp.batch_add_links('COUNTRY', links) 60 | 61 | def unit_test(self): 62 | return super().unit_test(['COUNTRY']) 63 | 64 | 65 | def main() -> None: 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--unit-test', action='store_true') 68 | args = parser.parse_args() 69 | 70 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 71 | logging.basicConfig( 72 | format=FORMAT, 73 | filename='log/' + NAME + '.log', 74 | level=logging.INFO, 75 | datefmt='%Y-%m-%d %H:%M:%S', 76 | ) 77 | 78 | logging.info(f'Started: {sys.argv}') 79 | 80 | crawler = Crawler(ORG, URL, NAME) 81 | if args.unit_test: 82 | crawler.unit_test() 83 | else: 84 | crawler.run() 85 | crawler.close() 86 | logging.info(f'Finished: {sys.argv}') 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | sys.exit(0) 92 | -------------------------------------------------------------------------------- /iyp/crawlers/manrs/README.md: -------------------------------------------------------------------------------- 1 | # MANRS -- https://www.manrs.org/ 2 | 3 | Mutually Agreed Norms for Routing Security (MANRS) is an initiative to improve the security and 4 | resilience of the Internet’s global routing system. It does this by encouraging those running BGP to 5 | implement well-established industry best practices and technological solutions that can address the 6 | most common threats. 7 | 8 | A network operator can become a MANRS member by implementing *Actions* that are further described 9 | [here](https://www.manrs.org/netops/network-operator-actions/). Currently there are four actions: 10 | 11 | 1. Filtering: Prevent propagation of incorrect routing information 12 | 1. Anti-spoofing: Prevent traffic with spoofed source IP addresses 13 | 1. Coordination: Facilitate global operational communication and coordination 14 | 1. Global Validation: Facilitate routing information on a global scale 15 | 16 | IYP contains information about the membership status of networks (in form of AS nodes) and which 17 | actions are implemented by each member. The country assignment provided by MANRS is also used to 18 | enhance the existing AS-to-Country mappings. 19 | 20 | ## Graph representation 21 | 22 | ```Cypher 23 | (:AS {asn: 2497})-[:MEMBER_OF]->(:Organization {name: 'MANRS'}) 24 | (:AS {asn: 2497})-[:IMPLEMENT]->(:ManrsAction {label: 'MANRS Action 1: Filtering'}) 25 | (:AS {asn: 2497})-[:COUNTRY]->(:Country {country_code: 'JP'}) 26 | ``` 27 | 28 | Possible labels for ManrsAction nodes: 29 | 30 | - `MANRS Action 1: Filtering` 31 | - `MANRS Action 2: Anti-spoofing` 32 | - `MANRS Action 3: Coordination` 33 | - `MANRS Action 4: Global Validation` 34 | 35 | ## Dependence 36 | 37 | This crawler is not depending on other crawlers. 38 | -------------------------------------------------------------------------------- /iyp/crawlers/nro/README.md: -------------------------------------------------------------------------------- 1 | # Number Resource Organization -- https://www.nro.net/ 2 | 3 | The Number Resource Organization (NRO) is the coordinating body for the world’s Regional Internet 4 | Registries (RIRs). The RIRs manage the distribution of Internet number resources (IP address space 5 | and Autonomous System Numbers) within their respective regions. 
6 | 7 | As part of a joint RIR project to provide consistent and accessible Internet number resource 8 | statistics the NRO publishes [*Extended Allocation and Assignment 9 | Reports*](https://www.nro.net/about/rirs/statistics/) (also called *delegated stats*) that contain 10 | information about assigned IP address ranges and AS numbers. 11 | 12 | Each line of the report is a record that either represents an IP address range or an AS number. The 13 | record has a status and maps to an *opaque ID* that uniquely identifies a single organization. 14 | Finally, the record contains a country code to which the organization belongs. 15 | 16 | **Note:** If the record is not assigned, the country code is `ZZ`, which will still be inserted into 17 | IYP. 18 | 19 | ## Graph representation 20 | 21 | ```Cypher 22 | (:AS {asn: 7494})-[:AVAILABLE {registry: 'apnic'}]->(:OpaqueID {id: 'apnic'}) 23 | (:AS {asn: 2497})-[:ASSIGNED {registry: 'apnic'}]->(:OpaqueID {id: 'A91A7381'}) 24 | (:AS {asn: 608})-[:RESERVED {registry: 'arin'}]->(:OpaqueID {id: 'arin'}) 25 | (:AS {asn: 2497})-[:COUNTRY]->(:Country {country_code: 'JP'}) 26 | 27 | (:RIRPrefix {prefix: '2a03:1dc0::/27'})-[:AVAILABLE {registry: 'ripencc'}]->(:OpaqueID {id: 'ripencc'}) 28 | (:RIRPrefix {prefix: '202.0.65.0/24'})-[:ASSIGNED {registry: 'apnic'}]->(:OpaqueID {id: 'A91A7381'}) 29 | (:RIRPrefix {prefix: '196.20.32.0/19'})-[:RESERVED {registry: 'afrinic'}]->(:OpaqueID {id: 'afrinic'}) 30 | (:RIRPrefix {prefix: '196.20.32.0/19'})-[:COUNTRY]->(:Country {country_code: 'ZZ'}) 31 | ``` 32 | 33 | The report also contains `allocated` records that would result in a `ALLOCATED` relationship. 34 | However, this crawler does not add ASes, so if the AS node was not created by another crawler, which 35 | should not happen for `allocated` ASes, the relationship is not created. 36 | 37 | The IPv4 address ranges in the report are not necessarily aligned with CIDR ranges (prefixes are 38 | represented by the first IP and a *count of hosts*). The crawler decomposes these IP blocks into 39 | valid CIDR prefixes if necessary. 40 | 41 | ## Dependence 42 | 43 | This crawler does not create new AS nodes and should be run after crawlers that push many AS nodes 44 | (e.g., `ripe.as_names`). 
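As noted above, IPv4 records in the report give a start address and a host count rather than a prefix. A minimal sketch of the decomposition into CIDR blocks, using only the standard library (the same `summarize_address_range` approach used by `ipinfo.ip_country`; the helper name is illustrative):

```python
from ipaddress import ip_address, summarize_address_range


def range_to_prefixes(start_ip, count):
    # A delegated-stats IPv4 record covers `count` addresses starting at
    # `start_ip`; the range is not necessarily CIDR aligned, so split it
    # into the equivalent list of CIDR prefixes.
    start = ip_address(start_ip)
    end = start + count - 1
    return [p.compressed for p in summarize_address_range(start, end)]


# e.g. range_to_prefixes('202.0.64.0', 1024) -> ['202.0.64.0/22']
#      range_to_prefixes('202.0.65.0', 512)  -> ['202.0.65.0/24', '202.0.66.0/24']
```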
45 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/facebookmessenger.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.facebookmessenger' 11 | 12 | label = 'OONI Facebook Messenger Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'facebookmessenger') 19 | self.categories = ['unblocked', 'dns_blocking', 'tcp_blocking', 'both_blocked'] 20 | 21 | # Process a single line from the jsonl file and store the results locally 22 | def process_one_line(self, one_line): 23 | if super().process_one_line(one_line): 24 | return 25 | result_dns = one_line['test_keys'].get('facebook_dns_blocking', None) 26 | result_tcp = one_line['test_keys'].get('facebook_tcp_blocking', None) 27 | if result_dns is None or result_tcp is None: 28 | self.all_results.pop() 29 | return 30 | 31 | # Using the last result from the base class, add our unique variables 32 | self.all_results[-1] = self.all_results[-1] + (result_dns, result_tcp) 33 | 34 | def batch_add_to_iyp(self): 35 | super().batch_add_to_iyp() 36 | 37 | facebookmessenger_id = self.iyp.get_node('Tag', {'label': label}, create=True) 38 | 39 | censored_links = list() 40 | 41 | # Create one link per ASN-country pair. 42 | for (asn, country), result_dict in self.all_percentages.items(): 43 | asn_id = self.node_ids['asn'][asn] 44 | props = dict() 45 | for category in self.categories: 46 | props[f'percentage_{category}'] = result_dict['percentages'][category] 47 | props[f'count_{category}'] = result_dict['category_counts'][category] 48 | props['total_count'] = result_dict['total_count'] 49 | props['country_code'] = country 50 | censored_links.append( 51 | {'src_id': asn_id, 'dst_id': facebookmessenger_id, 'props': [props, self.reference]} 52 | ) 53 | 54 | self.iyp.batch_add_links('CENSORED', censored_links) 55 | 56 | def aggregate_results(self): 57 | target_dict = defaultdict(lambda: defaultdict(int)) 58 | 59 | for entry in self.all_results: 60 | asn, country, result_dns, result_tcp = entry 61 | if not result_dns and not result_tcp: 62 | target_dict[(asn, country)]['unblocked'] += 1 63 | elif result_dns and not result_tcp: 64 | target_dict[(asn, country)]['dns_blocking'] += 1 65 | elif not result_dns and result_tcp: 66 | target_dict[(asn, country)]['tcp_blocking'] += 1 67 | elif result_dns and result_tcp: 68 | target_dict[(asn, country)]['both_blocked'] += 1 69 | 70 | for (asn, country), counts in target_dict.items(): 71 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 72 | 73 | def unit_test(self): 74 | return super().unit_test(['CENSORED']) 75 | 76 | 77 | def main() -> None: 78 | parser = argparse.ArgumentParser() 79 | parser.add_argument('--unit-test', action='store_true') 80 | args = parser.parse_args() 81 | 82 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 83 | logging.basicConfig( 84 | format=FORMAT, 85 | filename='log/' + NAME + '.log', 86 | level=logging.INFO, 87 | datefmt='%Y-%m-%d %H:%M:%S', 88 | ) 89 | 90 | logging.info(f'Started: {sys.argv}') 91 | 92 | crawler = Crawler(ORG, URL, NAME) 93 | if args.unit_test: 94 | crawler.unit_test() 95 | else: 96 | crawler.run() 97 | crawler.close() 98 | logging.info(f'Finished: {sys.argv}') 
99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | sys.exit(0) 104 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/httpinvalidrequestline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.httpinvalidrequestline' 11 | 12 | label = 'OONI HTTP Invalid Request Line Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'httpinvalidrequestline') 19 | self.categories = ['tampering', 'no_tampering'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the jsonl file and store the results locally.""" 23 | if super().process_one_line(one_line): 24 | return 25 | 26 | tampering = 'tampering' if one_line['test_keys']['tampering'] else 'no_tampering' 27 | 28 | # Using the last result from the base class, add our unique variables 29 | self.all_results[-1] = self.all_results[-1] + (tampering,) 30 | 31 | def batch_add_to_iyp(self): 32 | super().batch_add_to_iyp() 33 | 34 | httpinvalidrequestline_id = self.iyp.get_node('Tag', {'label': label}, create=True) 35 | 36 | censored_links = list() 37 | 38 | # Create one link per ASN-country pair. 39 | for (asn, country), result_dict in self.all_percentages.items(): 40 | asn_id = self.node_ids['asn'][asn] 41 | props = dict() 42 | for category in self.categories: 43 | props[f'percentage_{category}'] = result_dict['percentages'][category] 44 | props[f'count_{category}'] = result_dict['category_counts'][category] 45 | props['total_count'] = result_dict['total_count'] 46 | props['country_code'] = country 47 | censored_links.append( 48 | {'src_id': asn_id, 'dst_id': httpinvalidrequestline_id, 'props': [props, self.reference]} 49 | ) 50 | 51 | self.iyp.batch_add_links('CENSORED', censored_links) 52 | 53 | def aggregate_results(self): 54 | target_dict = defaultdict(lambda: defaultdict(int)) 55 | 56 | # Populate the target_dict with counts 57 | for entry in self.all_results: 58 | asn, country, tampering = entry 59 | target_dict[(asn, country)][tampering] += 1 60 | 61 | for (asn, country), counts in target_dict.items(): 62 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 63 | 64 | def unit_test(self): 65 | return super().unit_test(['CENSORED']) 66 | 67 | 68 | def main() -> None: 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--unit-test', action='store_true') 71 | args = parser.parse_args() 72 | 73 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 74 | logging.basicConfig( 75 | format=FORMAT, 76 | filename='log/' + NAME + '.log', 77 | level=logging.INFO, 78 | datefmt='%Y-%m-%d %H:%M:%S', 79 | ) 80 | 81 | logging.info(f'Started: {sys.argv}') 82 | 83 | crawler = Crawler(ORG, URL, NAME) 84 | if args.unit_test: 85 | crawler.unit_test() 86 | else: 87 | crawler.run() 88 | crawler.close() 89 | logging.info(f'Finished: {sys.argv}') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | sys.exit(0) 95 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/osignal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from 
iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | # This crawler is not called 'signal' to prevent name collision with Python's built-in 11 | # module. 12 | NAME = 'ooni.osignal' 13 | 14 | label = 'OONI Signal Test' 15 | 16 | 17 | class Crawler(OoniCrawler): 18 | 19 | def __init__(self, organization, url, name): 20 | super().__init__(organization, url, name, 'signal') 21 | self.categories = ['ok', 'blocked'] 22 | 23 | def process_one_line(self, one_line): 24 | """Process a single line from the jsonl file and store the results locally.""" 25 | if super().process_one_line(one_line): 26 | return 27 | signal_backend_status = one_line['test_keys']['signal_backend_status'] 28 | if signal_backend_status is None: 29 | self.all_results.pop() 30 | return 31 | 32 | # Using the last result from the base class, add our unique variables 33 | self.all_results[-1] = self.all_results[-1] + (signal_backend_status,) 34 | 35 | def batch_add_to_iyp(self): 36 | super().batch_add_to_iyp() 37 | 38 | signal_id = self.iyp.get_node('Tag', {'label': label}, create=True) 39 | 40 | censored_links = list() 41 | 42 | # Create one link per ASN-country pair. 43 | for (asn, country), result_dict in self.all_percentages.items(): 44 | asn_id = self.node_ids['asn'][asn] 45 | props = dict() 46 | for category in self.categories: 47 | props[f'percentage_{category}'] = result_dict['percentages'][category] 48 | props[f'count_{category}'] = result_dict['category_counts'][category] 49 | props['total_count'] = result_dict['total_count'] 50 | props['country_code'] = country 51 | censored_links.append( 52 | {'src_id': asn_id, 'dst_id': signal_id, 'props': [props, self.reference]} 53 | ) 54 | 55 | self.iyp.batch_add_links('CENSORED', censored_links) 56 | 57 | def aggregate_results(self): 58 | target_dict = defaultdict(lambda: defaultdict(int)) 59 | 60 | # Populate the target_dict with counts 61 | for entry in self.all_results: 62 | asn, country, result = entry 63 | target_dict[(asn, country)][result] += 1 64 | 65 | for (asn, country), counts in target_dict.items(): 66 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 67 | 68 | def unit_test(self): 69 | return super().unit_test(['CENSORED']) 70 | 71 | 72 | def main() -> None: 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--unit-test', action='store_true') 75 | args = parser.parse_args() 76 | 77 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 78 | logging.basicConfig( 79 | format=FORMAT, 80 | filename='log/' + NAME + '.log', 81 | level=logging.INFO, 82 | datefmt='%Y-%m-%d %H:%M:%S', 83 | ) 84 | 85 | logging.info(f'Started: {sys.argv}') 86 | 87 | crawler = Crawler(ORG, URL, NAME) 88 | if args.unit_test: 89 | crawler.unit_test() 90 | else: 91 | crawler.run() 92 | crawler.close() 93 | logging.info(f'Finished: {sys.argv}') 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | sys.exit(0) 99 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/psiphon.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.psiphon' 11 | 12 | label = 'OONI Psiphon Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 
'psiphon') 19 | self.categories = [ 20 | 'bootstrapping_error', 21 | 'usage_error', 22 | 'working', 23 | 'invalid', 24 | ] 25 | 26 | def process_one_line(self, one_line): 27 | """Process a single line from the jsonl file and store the results locally.""" 28 | if super().process_one_line(one_line): 29 | return 30 | 31 | if 'bootstrap_time' not in one_line['test_keys']: 32 | self.all_results.pop() 33 | return 34 | 35 | bootstrap_time = one_line['test_keys']['bootstrap_time'] 36 | failure = one_line['test_keys']['failure'] 37 | 38 | if bootstrap_time == 0 and failure is not None: 39 | result = 'bootstrapping_error' 40 | elif bootstrap_time > 0 and failure is not None: 41 | result = 'usage_error' 42 | elif bootstrap_time > 0 and failure is None: 43 | result = 'working' 44 | elif bootstrap_time == 0 and failure is None: 45 | result = 'invalid' 46 | 47 | # Using the last result from the base class, add our unique variables 48 | self.all_results[-1] = self.all_results[-1] + (result,) 49 | 50 | def batch_add_to_iyp(self): 51 | super().batch_add_to_iyp() 52 | 53 | psiphon_id = self.iyp.get_node('Tag', {'label': label}, create=True) 54 | 55 | censored_links = list() 56 | 57 | # Create one link per ASN-country pair. 58 | for (asn, country), result_dict in self.all_percentages.items(): 59 | asn_id = self.node_ids['asn'][asn] 60 | props = dict() 61 | for category in self.categories: 62 | props[f'percentage_{category}'] = result_dict['percentages'][category] 63 | props[f'count_{category}'] = result_dict['category_counts'][category] 64 | props['total_count'] = result_dict['total_count'] 65 | props['country_code'] = country 66 | censored_links.append( 67 | {'src_id': asn_id, 'dst_id': psiphon_id, 'props': [props, self.reference]} 68 | ) 69 | 70 | self.iyp.batch_add_links('CENSORED', censored_links) 71 | 72 | def aggregate_results(self): 73 | target_dict = defaultdict(lambda: defaultdict(int)) 74 | 75 | # Populate the target_dict with counts 76 | for entry in self.all_results: 77 | asn, country, result = entry 78 | target_dict[(asn, country)][result] += 1 79 | 80 | for (asn, country), counts in target_dict.items(): 81 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 82 | 83 | def unit_test(self): 84 | return super().unit_test(['CENSORED']) 85 | 86 | 87 | def main() -> None: 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--unit-test', action='store_true') 90 | args = parser.parse_args() 91 | 92 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 93 | logging.basicConfig( 94 | format=FORMAT, 95 | filename='log/' + NAME + '.log', 96 | level=logging.INFO, 97 | datefmt='%Y-%m-%d %H:%M:%S', 98 | ) 99 | 100 | logging.info(f'Started: {sys.argv}') 101 | 102 | crawler = Crawler(ORG, URL, NAME) 103 | if args.unit_test: 104 | crawler.unit_test() 105 | else: 106 | crawler.run() 107 | crawler.close() 108 | logging.info(f'Finished: {sys.argv}') 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | sys.exit(0) 114 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/riseupvpn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.riseupvpn' 11 | 12 | label = 'OONI RiseupVPN Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 
18 | super().__init__(organization, url, name, 'riseupvpn') 19 | self.categories = ['ok', 'failure'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the jsonl file and store the results locally.""" 23 | if super().process_one_line(one_line): 24 | return 25 | 26 | api_failures = one_line['test_keys'].get('api_failures') 27 | ca_cert_status = one_line['test_keys']['ca_cert_status'] 28 | 29 | if not api_failures and ca_cert_status: 30 | result = 'ok' 31 | else: 32 | result = 'failure' 33 | 34 | # Using the last result from the base class, add our unique variables 35 | self.all_results[-1] = self.all_results[-1] + (result,) 36 | 37 | def batch_add_to_iyp(self): 38 | super().batch_add_to_iyp() 39 | 40 | riseupvpn_id = self.iyp.get_node('Tag', {'label': label}, create=True) 41 | 42 | censored_links = list() 43 | 44 | # Create one link per ASN-country pair. 45 | for (asn, country), result_dict in self.all_percentages.items(): 46 | asn_id = self.node_ids['asn'][asn] 47 | props = dict() 48 | for category in self.categories: 49 | props[f'percentage_{category}'] = result_dict['percentages'][category] 50 | props[f'count_{category}'] = result_dict['category_counts'][category] 51 | props['total_count'] = result_dict['total_count'] 52 | props['country_code'] = country 53 | censored_links.append( 54 | {'src_id': asn_id, 'dst_id': riseupvpn_id, 'props': [props, self.reference]} 55 | ) 56 | 57 | self.iyp.batch_add_links('CENSORED', censored_links) 58 | 59 | def aggregate_results(self): 60 | target_dict = defaultdict(lambda: defaultdict(int)) 61 | 62 | # Populate the target_dict with counts 63 | for entry in self.all_results: 64 | asn, country, result = entry 65 | target_dict[(asn, country)][result] += 1 66 | 67 | for (asn, country), counts in target_dict.items(): 68 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 69 | 70 | def unit_test(self): 71 | return super().unit_test(['CENSORED']) 72 | 73 | 74 | def main() -> None: 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument('--unit-test', action='store_true') 77 | args = parser.parse_args() 78 | 79 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 80 | logging.basicConfig( 81 | format=FORMAT, 82 | filename='log/' + NAME + '.log', 83 | level=logging.INFO, 84 | datefmt='%Y-%m-%d %H:%M:%S', 85 | ) 86 | 87 | logging.info(f'Started: {sys.argv}') 88 | 89 | crawler = Crawler(ORG, URL, NAME) 90 | if args.unit_test: 91 | crawler.unit_test() 92 | else: 93 | crawler.run() 94 | crawler.close() 95 | logging.info(f'Finished: {sys.argv}') 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | sys.exit(0) 101 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/torsf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.torsf' 11 | 12 | label = 'OONI Tor Snowflake Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'torsf') 19 | self.categories = ['ok', 'failure'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the JSONL file.""" 23 | if super().process_one_line(one_line): 24 | return 25 | if 'success' not in one_line['test_keys']: 26 | self.all_results.pop() 27 | return 
28 | result = 'ok' if one_line['test_keys']['success'] else 'failure' 29 | 30 | # Update the last entry in all_results with the new test-specific data 31 | self.all_results[-1] = self.all_results[-1] + (result,) 32 | 33 | def batch_add_to_iyp(self): 34 | super().batch_add_to_iyp() 35 | 36 | torsf_id = self.iyp.get_node('Tag', {'label': label}, create=True) 37 | 38 | censored_links = list() 39 | 40 | # Create one link per ASN-country pair. 41 | for (asn, country), result_dict in self.all_percentages.items(): 42 | asn_id = self.node_ids['asn'][asn] 43 | props = dict() 44 | for category in self.categories: 45 | props[f'percentage_{category}'] = result_dict['percentages'][category] 46 | props[f'count_{category}'] = result_dict['category_counts'][category] 47 | props['total_count'] = result_dict['total_count'] 48 | props['country_code'] = country 49 | censored_links.append( 50 | {'src_id': asn_id, 'dst_id': torsf_id, 'props': [props, self.reference]} 51 | ) 52 | 53 | self.iyp.batch_add_links('CENSORED', censored_links) 54 | 55 | def aggregate_results(self): 56 | target_dict = defaultdict(lambda: defaultdict(int)) 57 | 58 | # Populate the target_dict with counts 59 | for entry in self.all_results: 60 | asn, country, result = entry 61 | target_dict[(asn, country)][result] += 1 62 | 63 | for (asn, country), counts in target_dict.items(): 64 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 65 | 66 | def unit_test(self): 67 | return super().unit_test(['CENSORED']) 68 | 69 | 70 | def main() -> None: 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--unit-test', action='store_true') 73 | args = parser.parse_args() 74 | 75 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 76 | logging.basicConfig( 77 | format=FORMAT, 78 | filename='log/' + NAME + '.log', 79 | level=logging.INFO, 80 | datefmt='%Y-%m-%d %H:%M:%S', 81 | ) 82 | 83 | logging.info(f'Started: {sys.argv}') 84 | 85 | crawler = Crawler(ORG, URL, NAME) 86 | if args.unit_test: 87 | crawler.unit_test() 88 | else: 89 | crawler.run() 90 | crawler.close() 91 | logging.info(f'Finished: {sys.argv}') 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | sys.exit(0) 97 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternetHealthReport/internet-yellow-pages/c79ced8783c73f282d273a886f4aa5cf8018fdd9/iyp/crawlers/ooni/utils/__init__.py -------------------------------------------------------------------------------- /iyp/crawlers/ooni/utils/grabber.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | import shutil 7 | from multiprocessing import Pool 8 | 9 | import boto3 10 | import botocore 11 | 12 | # Global variable required for multiprocessing. 13 | s3 = None 14 | 15 | PARALLEL_DOWNLOADS = 4 16 | if os.path.exists('config.json'): 17 | config = json.load(open('config.json', 'r')) 18 | PARALLEL_DOWNLOADS = config['ooni']['parallel_downloads'] 19 | 20 | 21 | def process(params: tuple): 22 | """Download and extract a single file. 23 | 24 | Args: 25 | params (tuple): Object key and output file path. 
26 | """ 27 | key, dest_file = params 28 | # Download the file 29 | try: 30 | s3.download_file(key, dest_file) 31 | except Exception as e: 32 | logging.error(f'Error downloading {key}: {e}') 33 | return 34 | 35 | # Extract the .gz file 36 | try: 37 | extracted_file = dest_file.rstrip('.gz') 38 | with gzip.open(dest_file, 'rb') as f_in, open(extracted_file, 'wb') as f_out: 39 | shutil.copyfileobj(f_in, f_out) 40 | # Delete the .gz file 41 | os.remove(dest_file) 42 | except Exception as e: 43 | logging.error(f'Error extracting {key}: {e}') 44 | 45 | 46 | def download_and_extract(repo: str, tmpdir: str, test_name: str): 47 | """Download the last 7 days of data for the specified test from an S3 bucket into a 48 | temporary directory. 49 | 50 | Args: 51 | repo (str): S3 bucket 52 | tmpdir (str): Output directory 53 | test_name (str): Test name 54 | """ 55 | global s3 56 | # Create an anonymous session 57 | s3 = boto3.resource( 58 | 's3', 59 | region_name='ap-northeast-1', 60 | config=botocore.client.Config( 61 | signature_version=botocore.UNSIGNED 62 | ) 63 | ).Bucket(repo) 64 | 65 | # Get the dates for the last 7 days. 66 | dates = [ 67 | ( 68 | datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=i) 69 | ).strftime('%Y%m%d') 70 | for i in range(7) 71 | ] 72 | 73 | files = list() 74 | 75 | logging.info('Fetching object list...') 76 | # For each day, grab the objects from the S3 bucket. 77 | for date in dates: 78 | date_objects = s3.objects.filter(Prefix=f'raw/{date}/').all() 79 | # Filter for objects from the requested test and only fetch JSONL files. 80 | for object_summary in date_objects: 81 | key = object_summary.key 82 | key_split = key.split('/') 83 | if len(key_split) != 6: 84 | logging.warning(f'Malformed key: {key}') 85 | continue 86 | test = key_split[4] 87 | object_name = key_split[5] 88 | if test != test_name or not object_name.endswith('.jsonl.gz'): 89 | continue 90 | dest_file = os.path.join(tmpdir, object_name) 91 | files.append((key, dest_file)) 92 | 93 | logging.info(f'Fetching {len(files)} objects with {PARALLEL_DOWNLOADS} processes in parallel...') 94 | # Download and extract the files. 95 | with Pool(PARALLEL_DOWNLOADS) as p: 96 | p.map(process, files) 97 | -------------------------------------------------------------------------------- /iyp/crawlers/ooni/vanillator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from collections import defaultdict 5 | 6 | from iyp.crawlers.ooni import OoniCrawler 7 | 8 | ORG = 'OONI' 9 | URL = 's3://ooni-data-eu-fra/raw/' 10 | NAME = 'ooni.vanillator' 11 | 12 | label = 'OONI Vanilla Tor Test' 13 | 14 | 15 | class Crawler(OoniCrawler): 16 | 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name, 'vanillator') 19 | self.categories = ['ok', 'failure'] 20 | 21 | def process_one_line(self, one_line): 22 | """Process a single line from the JSONL file.""" 23 | if super().process_one_line(one_line): 24 | return 25 | result = 'ok' if one_line['test_keys']['success'] else 'failure' 26 | 27 | # Update the last entry in all_results with the new test-specific data 28 | self.all_results[-1] = self.all_results[-1] + (result,) 29 | 30 | def batch_add_to_iyp(self): 31 | super().batch_add_to_iyp() 32 | 33 | vanillator_id = self.iyp.get_node('Tag', {'label': label}, create=True) 34 | 35 | censored_links = list() 36 | 37 | # Create one link per ASN-country pair. 
38 | for (asn, country), result_dict in self.all_percentages.items(): 39 | asn_id = self.node_ids['asn'][asn] 40 | props = dict() 41 | for category in self.categories: 42 | props[f'percentage_{category}'] = result_dict['percentages'][category] 43 | props[f'count_{category}'] = result_dict['category_counts'][category] 44 | props['total_count'] = result_dict['total_count'] 45 | props['country_code'] = country 46 | censored_links.append( 47 | {'src_id': asn_id, 'dst_id': vanillator_id, 'props': [props, self.reference]} 48 | ) 49 | 50 | self.iyp.batch_add_links('CENSORED', censored_links) 51 | 52 | def aggregate_results(self): 53 | target_dict = defaultdict(lambda: defaultdict(int)) 54 | 55 | # Populate the target_dict with counts 56 | for entry in self.all_results: 57 | asn, country, result = entry 58 | target_dict[(asn, country)][result] += 1 59 | 60 | for (asn, country), counts in target_dict.items(): 61 | self.all_percentages[(asn, country)] = self.make_result_dict(counts) 62 | 63 | def unit_test(self): 64 | return super().unit_test(['CENSORED']) 65 | 66 | 67 | def main() -> None: 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument('--unit-test', action='store_true') 70 | args = parser.parse_args() 71 | 72 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 73 | logging.basicConfig( 74 | format=FORMAT, 75 | filename='log/' + NAME + '.log', 76 | level=logging.INFO, 77 | datefmt='%Y-%m-%d %H:%M:%S', 78 | ) 79 | 80 | logging.info(f'Started: {sys.argv}') 81 | 82 | crawler = Crawler(ORG, URL, NAME) 83 | if args.unit_test: 84 | crawler.unit_test() 85 | else: 86 | crawler.run() 87 | crawler.close() 88 | logging.info(f'Finished: {sys.argv}') 89 | 90 | 91 | if __name__ == '__main__': 92 | main() 93 | sys.exit(0) 94 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/README.md: -------------------------------------------------------------------------------- 1 | # OpenINTEL -- https://www.openintel.nl/ 2 | 3 | The OpenINTEL measurement platform captures daily snapshots of the state of large parts of the 4 | global Domain Name System (DNS) by running a number of forward and reverse DNS measurements. 5 | 6 | While OpenINTEL runs measurements to a variety of domain names, IYP currently only fetches data for 7 | the [Tranco top 1 million list](https://data.openintel.nl/data/tranco1m/) and the CISCO Umbrella 8 | top 1 million list since it combines rankings. 9 | IYP also get the list of authoritative names servers seen by OpenINTEL. 10 | 11 | A crawler of mail servers is also implemented but not used as it creates a very large number 12 | of links and this dataset is currently not requested/needed by anyone. 13 | 14 | ## Graph representation 15 | 16 | IP resolution for popular host names: 17 | 18 | ```Cypher 19 | (:HostName {name: 'google.com'})-[:RESOLVES_TO]->(:IP {ip: '142.250.179.142'}) 20 | ``` 21 | 22 | IP resolution of authoritative name servers: 23 | 24 | ```Cypher 25 | (:HostName:AuthoritativeNameServer {name: 'ns1.google.com'})-[:RESOLVES_TO]->(:IP {ip: '216.239.32.10'}) 26 | (:IP {ip: '216.239.32.10'})-[:SERVE]->(:Service {name: 'DNS'}) 27 | ``` 28 | 29 | Domain names managed by name servers: 30 | 31 | ```Cypher 32 | (:DomainName {name: 'google.com'})-[:MANAGED_BY]->(:HostName:AuthoritativeNameServer {name: 'ns1.google.com'}) 33 | ``` 34 | 35 | ## Dependence 36 | 37 | This crawler is not depending on other crawlers. 
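As an illustration of how these relationships combine (an example query, not part of the crawlers), the authoritative name servers of a domain and the addresses they resolve to can be retrieved with:

```Cypher
MATCH (d:DomainName {name: 'google.com'})-[:MANAGED_BY]->(ns:HostName:AuthoritativeNameServer)
OPTIONAL MATCH (ns)-[:RESOLVES_TO]->(ip:IP)
RETURN d.name, ns.name, collect(ip.ip) AS addresses
```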
38 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/crux.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl/data/' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.crux' 10 | 11 | DATASET = 'crux' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_crux.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/CRUX' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_crux' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_jp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/JP' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_jp' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: 
{sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_nl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/NL' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_nl' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/dnsgraph_rdns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import DnsgraphCrawler 6 | 7 | URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/RDNS' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.dnsgraph_rdns' 10 | 11 | 12 | class Crawler(DnsgraphCrawler): 13 | def __init__(self, organization, url, name): 14 | super().__init__(organization, url, name) 15 | 16 | 17 | def main() -> None: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--unit-test', action='store_true') 20 | args = parser.parse_args() 21 | 22 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 23 | logging.basicConfig( 24 | format=FORMAT, 25 | filename='log/' + NAME + '.log', 26 | level=logging.INFO, 27 | datefmt='%Y-%m-%d %H:%M:%S' 28 | ) 29 | 30 | logging.info(f'Started: {sys.argv}') 31 | 32 | crawler = Crawler(ORG, URL, NAME) 33 | if args.unit_test: 34 | crawler.unit_test() 35 | else: 36 | crawler.run() 37 | crawler.close() 38 | logging.info(f'Finished: {sys.argv}') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | sys.exit(0) 44 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/infra_mx.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.infra_mx' 10 | 11 | DATASET = 'infra:mx' 12 | NODE_TYPE = 'MailServer' 13 | 14 | 15 | class Crawler(OpenIntelCrawler): 16 | def __init__(self, organization, url, name): 17 | super().__init__(organization, url, name, DATASET, NODE_TYPE) 18 | 19 | 20 | def main() -> None: 21 | 22 | ############################################ 23 | # 
This crawler is not working the NODE_TYPE argument has been deprecated 24 | ############################################ 25 | return 26 | 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('--unit-test', action='store_true') 29 | args = parser.parse_args() 30 | 31 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 32 | logging.basicConfig( 33 | format=FORMAT, 34 | filename='log/' + NAME + '.log', 35 | level=logging.INFO, 36 | datefmt='%Y-%m-%d %H:%M:%S' 37 | ) 38 | 39 | logging.info(f'Started: {sys.argv}') 40 | 41 | crawler = Crawler(ORG, URL, NAME) 42 | if args.unit_test: 43 | crawler.unit_test() 44 | else: 45 | crawler.run() 46 | crawler.close() 47 | logging.info(f'Finished: {sys.argv}') 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | sys.exit(0) 53 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/infra_ns.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.infra_ns' 10 | 11 | DATASET = 'infra:ns' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/tranco1m.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl/data/tranco1m/' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.tranco1m' 10 | 11 | DATASET = 'tranco' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/openintel/umbrella1m.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.openintel import OpenIntelCrawler 6 | 7 | URL = 'https://data.openintel.nl/data/umbrella1m/' 8 | ORG = 'OpenINTEL' 9 | NAME = 'openintel.umbrella1m' 10 | 11 | DATASET = 'umbrella' 12 | 13 | 14 | class Crawler(OpenIntelCrawler): 15 | def __init__(self, organization, url, name): 16 | super().__init__(organization, url, name, DATASET) 17 | 18 | 19 | def main() -> None: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--unit-test', action='store_true') 22 | args = parser.parse_args() 23 | 24 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 25 | logging.basicConfig( 26 | format=FORMAT, 27 | filename='log/' + NAME + '.log', 28 | level=logging.INFO, 29 | datefmt='%Y-%m-%d %H:%M:%S' 30 | ) 31 | 32 | logging.info(f'Started: {sys.argv}') 33 | 34 | crawler = Crawler(ORG, URL, NAME) 35 | if args.unit_test: 36 | crawler.unit_test() 37 | else: 38 | crawler.run() 39 | crawler.close() 40 | logging.info(f'Finished: {sys.argv}') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | sys.exit(0) 46 | -------------------------------------------------------------------------------- /iyp/crawlers/pch/README.md: -------------------------------------------------------------------------------- 1 | # Packet Clearing House -- https://www.pch.net/ 2 | 3 | Packet Clearing House (PCH) is an international organization responsible for providing operational 4 | support and security to critical Internet infrastructure, including Internet exchange points and the 5 | core of the domain name system. 6 | 7 | PCH operates route collectors at more than 100 Internet Exchange Points around the world. 8 | [Data](https://www.pch.net/resources/Routing_Data/) from these route collectors is made available 9 | publicly for the benefit of the Internet's operational and research communities. 10 | 11 | IYP fetches the *Daily snapshots of the results of "show ip bgp" on PCH route collectors*, which 12 | indicate the state of the routing table on PCH route collectors at the moment in time that the 13 | snapshot is taken. 14 | 15 | IYP uses the announced routes to infer the origin ASes of announced prefixes, some of which might 16 | not be visible in route collectors from Route Views or RIPE RIS. 17 | 18 | ## Graph representation 19 | 20 | ```Cypher 21 | (:AS {asn: 2497})-[:ORIGINATE {count: 4}]->(:BGPPrefix {prefix: '101.128.128.0/17'}) 22 | 23 | ``` 24 | 25 | The `ORIGINATE` relationship contains the property `count` that, similar to the relationship 26 | produced by `bgpkit.pfx2asn`, indicates by how many route collectors the announcement was seen. 27 | A detailed list of collector names is also available via the `seen_by_collectors` property. 28 | 29 | ## Dependence 30 | 31 | This crawler may create new `BGPPrefix` nodes that miss the `af` property, so the 32 | `iyp.post.address_family` postprocessing script should be run after this. 
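For readers exploring this dataset in the graph, a minimal query is sketched below. It assumes the usual IYP reference properties on relationships, in particular that these crawlers set `reference_name` to `pch.daily_routing_snapshots_v4`/`_v6`; only `count` and `seen_by_collectors` are documented above, so the filter property is an assumption.

```Cypher
// Sketch: prefixes originated by AS2497 according to PCH snapshots,
// ordered by how many route collectors saw the announcement
// (reference_name values assumed from the IYP naming scheme).
MATCH (a:AS {asn: 2497})-[o:ORIGINATE]->(p:BGPPrefix)
WHERE o.reference_name STARTS WITH 'pch.daily_routing_snapshots'
RETURN p.prefix, o.count, o.seen_by_collectors
ORDER BY o.count DESC
```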
33 | -------------------------------------------------------------------------------- /iyp/crawlers/pch/daily_routing_snapshots_v4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.pch import RoutingSnapshotCrawler 6 | 7 | ORG = 'Packet Clearing House' 8 | URL = 'https://www.pch.net/resources/Routing_Data/IPv4_daily_snapshots/' 9 | NAME = 'pch.daily_routing_snapshots_v4' 10 | 11 | 12 | class Crawler(RoutingSnapshotCrawler): 13 | def __init__(self, organization, url, name): 14 | self.name = name 15 | super().__init__(organization, url, name, af=4) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/pch/daily_routing_snapshots_v6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp.crawlers.pch import RoutingSnapshotCrawler 6 | 7 | ORG = 'Packet Clearing House' 8 | URL = 'https://www.pch.net/resources/Routing_Data/IPv6_daily_snapshots/' 9 | NAME = 'pch.daily_routing_snapshots_v6' 10 | 11 | 12 | class Crawler(RoutingSnapshotCrawler): 13 | def __init__(self, organization, url, name): 14 | self.name = name 15 | super().__init__(organization, url, name, af=6) 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--unit-test', action='store_true') 21 | args = parser.parse_args() 22 | 23 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 24 | logging.basicConfig( 25 | format=FORMAT, 26 | filename='log/' + NAME + '.log', 27 | level=logging.INFO, 28 | datefmt='%Y-%m-%d %H:%M:%S' 29 | ) 30 | 31 | logging.info(f'Started: {sys.argv}') 32 | 33 | crawler = Crawler(ORG, URL, NAME) 34 | if args.unit_test: 35 | crawler.unit_test() 36 | else: 37 | crawler.run() 38 | crawler.close() 39 | logging.info(f'Finished: {sys.argv}') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | sys.exit(0) 45 | -------------------------------------------------------------------------------- /iyp/crawlers/ripe/README.md: -------------------------------------------------------------------------------- 1 | # RIPE NCC -- https://www.ripe.net/ 2 | 3 | The RIPE Network Coordination Centre (RIPE NCC) is the Regional Internet Registry (RIR) for Europe, 4 | the Middle East and parts of Central Asia. 5 | 6 | IYP does not use RIPE-specific information, but fetches data from RIPE‘s convenient [FTP 7 | server](https://ftp.ripe.net/). 8 | 9 | ## Graph representation 10 | 11 | ### AS names - `as_names.py` 12 | 13 | RIPE NCC provides a simple [list of AS names](https://ftp.ripe.net/ripe/asnames/) (also containing a 14 | country code) which is the base of many AS nodes in IYP. 
15 | 16 | ```Cypher 17 | (:AS {asn: 2497})-[:NAME]->(:Name {name: 'IIJ Internet Initiative Japan Inc.'}) 18 | (:AS {asn: 2497})-[:COUNTRY]->(:Country {country_code: 'JP'}) 19 | ``` 20 | 21 | ### Route Origin Authorizations - `roa.py` 22 | 23 | A Route Origin Authorization (ROA) is a cryptographically signed object that states which AS is 24 | authorized to originate a particular IP address prefix or set of prefixes. 25 | 26 | IYP uses RIPE NCC‘s [mirror of Trust Anchor Locators](https://ftp.ripe.net/rpki/) of the five RIRs 27 | to extract ROA information. The max length specification of the ROA is added as the `maxLength` 28 | property on the relationship. 29 | 30 | ```Cypher 31 | (:AS {asn: 2497})-[:ROUTE_ORIGIN_AUTHORIZATION {maxLength: 18}]->(:RPKIPrefix {prefix: '49.239.64.0/18'}) 32 | ``` 33 | 34 | ### Atlas Probes - `atlas_probes.py` 35 | 36 | [RIPE Atlas](https://atlas.ripe.net/) is an Internet measurement platform maintained by 37 | the RIPE NCC. RIPE Atlas employs a global network of probes that measure Internet 38 | connectivity and reachability. 39 | 40 | We fetch the [list of probes](https://atlas.ripe.net/docs/apis/rest-api-manual/probes/) 41 | to model the location of Atlas probes in the graph topology based on the probe's IP(s), 42 | ASN(s), and country. 43 | 44 | ```Cypher 45 | (:AtlasProbe {id: 6425})-[:LOCATED_IN]->(:AS {asn: 2497}) 46 | (:AtlasProbe {id: 6425})-[:COUNTRY]->(:Country {country_code: 'JP'}) 47 | (:IP {ip: '202.214.97.16'})-[:ASSIGNED]->(:AtlasProbe {id: 6425}) 48 | ``` 49 | 50 | ### Atlas Measurements - `atlas_measurements.py` 51 | 52 | We fetch the [list of 53 | measurements](https://atlas.ripe.net/docs/apis/rest-api-manual/measurements/) 54 | to obtain metadata of *ongoing* Atlas measurements. `AtlasProbe`s are `PART_OF` 55 | `AtlasMeasurement`s and measurements `TARGET` one or more `IP`s, a `HostName`, or 56 | both. The Atlas platform also maps the measurement target to an `AS` number if possible. 57 | The crawler includes this relationship as well. 58 | 59 | To reduce the number of `PART_OF` relationships, this crawler ignores probes that were 60 | never connected or are abandoned. 61 | 62 | ```Cypher 63 | (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:AS {asn: 2497}) 64 | (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:HostName {name: 'jp-tyo-as2497.anchors.atlas.ripe.net'}) 65 | (:AtlasProbe {id: 6425})-[:PART_OF]->(:AtlasMeasurement {id: 17635549})-[:TARGET]->(:IP {ip: '202.214.87.158'}) 66 | ``` 67 | 68 | ## Dependence 69 | 70 | The `atlas_measurement` crawler fetches probe IDs for abandoned and never-connected 71 | probes and thus should be run after the `atlas_probes` crawler. 
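As an illustration of how these nodes fit together, the sketch below lists probes hosted in AS2497 along with the ongoing measurements they participate in. It uses only the labels and relationship types documented above.

```Cypher
// Sketch: Atlas probes in AS2497 and the measurements they are part of.
MATCH (p:AtlasProbe)-[:LOCATED_IN]->(:AS {asn: 2497})
MATCH (p)-[:PART_OF]->(m:AtlasMeasurement)-[:TARGET]->(t)
RETURN p.id AS probe, m.id AS measurement, labels(t) AS target_type
LIMIT 20
```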
72 | -------------------------------------------------------------------------------- /iyp/crawlers/ripe/as_names.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import requests 6 | 7 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 8 | 9 | URL = 'https://ftp.ripe.net/ripe/asnames/asn.txt' 10 | ORG = 'RIPE NCC' 11 | NAME = 'ripe.as_names' 12 | 13 | 14 | class Crawler(BaseCrawler): 15 | 16 | def run(self): 17 | """Fetch the AS name file from RIPE website and process lines one by one.""" 18 | 19 | req = requests.get(URL) 20 | req.raise_for_status() 21 | 22 | set_modification_time_from_last_modified_header(self.reference, req) 23 | 24 | lines = [] 25 | asns = set() 26 | names = set() 27 | countries = set() 28 | 29 | # Read asn file 30 | for line in req.text.splitlines(): 31 | asn, _, name_cc = line.partition(' ') 32 | name, _, cc = name_cc.rpartition(', ') 33 | 34 | if not all((asn, name, cc)) or len(cc) > 2: 35 | logging.warning(f'Ignoring invalid line: "{line}"') 36 | continue 37 | 38 | asn = int(asn) 39 | lines.append([asn, name, cc]) 40 | 41 | asns.add(asn) 42 | names.add(name) 43 | countries.add(cc) 44 | 45 | # get node IDs for ASNs, names, and countries 46 | asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 47 | name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names) 48 | country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries) 49 | 50 | # Compute links 51 | name_links = [] 52 | country_links = [] 53 | 54 | for asn, name, cc in lines: 55 | asn_qid = asn_id[asn] 56 | name_qid = name_id[name] 57 | country_qid = country_id[cc] 58 | 59 | name_links.append({'src_id': asn_qid, 'dst_id': name_qid, 60 | 'props': [self.reference]}) # Set AS name 61 | country_links.append({'src_id': asn_qid, 'dst_id': country_qid, 62 | 'props': [self.reference]}) # Set country 63 | 64 | # Push all links to IYP 65 | self.iyp.batch_add_links('NAME', name_links) 66 | self.iyp.batch_add_links('COUNTRY', country_links) 67 | 68 | def unit_test(self): 69 | return super().unit_test(['NAME', 'COUNTRY']) 70 | 71 | 72 | def main() -> None: 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument('--unit-test', action='store_true') 75 | args = parser.parse_args() 76 | 77 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 78 | logging.basicConfig( 79 | format=FORMAT, 80 | filename='log/' + NAME + '.log', 81 | level=logging.INFO, 82 | datefmt='%Y-%m-%d %H:%M:%S' 83 | ) 84 | 85 | logging.info(f'Started: {sys.argv}') 86 | 87 | crawler = Crawler(ORG, URL, NAME) 88 | if args.unit_test: 89 | crawler.unit_test() 90 | else: 91 | crawler.run() 92 | crawler.close() 93 | logging.info(f'Finished: {sys.argv}') 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | sys.exit(0) 99 | -------------------------------------------------------------------------------- /iyp/crawlers/simulamet/README.md: -------------------------------------------------------------------------------- 1 | # rDNS RIR data -- https://rir-data.org/ 2 | 3 | "Lowering the Barriers to Working with Public RIR-Level Data" is a joint project of 4 | SimulaMet and the University of Twente with the goal of making WHOIS, route object 5 | delegation, and reverse DNS (rDNS) zone files published by Regional Internet Registries 6 | (RIRs) more accessible. 
7 | 8 | IYP imports the rDNS files in a simplified format to indicate which authoritative name 9 | servers are responsible for a prefix. We do not model PTR records and the corresponding 10 | hierarchy but instead add a simple MANAGED_BY link. 11 | 12 | ## Graph representation 13 | 14 | ```cypher 15 | (:RDNSPrefix {prefix: '103.2.57.0/24'})-[:MANAGED_BY {source: 'APNIC', ttl: 172800}]->(:AuthoritativeNameServer {name: 'dns0.iij.ad.jp'}) 16 | ``` 17 | 18 | The `source` property indicates from which RIR the information was obtained, the `ttl` 19 | property refers to the time-to-live of the associated SOA record. 20 | 21 | ## Dependence 22 | 23 | This crawler is not depending on other crawlers. 24 | -------------------------------------------------------------------------------- /iyp/crawlers/stanford/README.md: -------------------------------------------------------------------------------- 1 | # Stanford's ASdb -- https://asdb.stanford.edu/ 2 | 3 | ASdb is a research dataset that maps public autonomous systems (identified by 4 | ASN) to organizations and up to three industry types using data from business 5 | intelligence databases, website classifiers, and a machine learning algorithm. 6 | 7 | ## Graph representation 8 | 9 | ### AS tags 10 | Connect AS to tag nodes meaning that an AS has been categorized according to the 11 | given tag. 12 | ``` 13 | (:AS {asn:32})-[:CATEGORIZED]-(:Tag {label: 'Colleges, Universities, and Professional Schools'}) 14 | ``` 15 | 16 | ## Dependence 17 | 18 | This crawler is not depending on other crawlers. 19 | -------------------------------------------------------------------------------- /iyp/crawlers/tranco/README.md: -------------------------------------------------------------------------------- 1 | # Tranco -- https://tranco-list.eu/ 2 | 3 | The Tranco list is a research-oriented top sites ranking hardened against manipulation. It [combines 4 | the rankings of several source lists](https://tranco-list.eu/methodology) to produce a daily list 5 | that is based on data of the past 30 days. 6 | 7 | IYP uses this data to create and annotate DomainName nodes. 8 | 9 | ## Graph representation 10 | 11 | The rank of the domain is indicated by the `rank` property of the relationship. 12 | 13 | ```Cypher 14 | (:DomainName {name: 'google.com'})-[:RANK {rank: 1}]->(:Ranking {name: 'Tranco top 1M'}) 15 | ``` 16 | 17 | ## Dependence 18 | 19 | This crawler is not depending on other crawlers. 
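A minimal query against this representation, using only the node labels and properties shown above, could look like the following sketch.

```Cypher
// Sketch: ten highest-ranked domains on the current Tranco list.
MATCH (d:DomainName)-[r:RANK]->(:Ranking {name: 'Tranco top 1M'})
WHERE r.rank <= 10
RETURN d.name, r.rank
ORDER BY r.rank
```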
20 | -------------------------------------------------------------------------------- /iyp/crawlers/tranco/top1m.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import io 3 | import logging 4 | import sys 5 | from zipfile import ZipFile 6 | 7 | import requests 8 | 9 | from iyp import BaseCrawler, set_modification_time_from_last_modified_header 10 | 11 | # URL to Tranco top 1M 12 | URL = 'https://tranco-list.eu/top-1m.csv.zip' 13 | ORG = 'imec-DistriNet' 14 | NAME = 'tranco.top1m' 15 | 16 | 17 | class Crawler(BaseCrawler): 18 | def __init__(self, organization, url, name): 19 | super().__init__(organization, url, name) 20 | self.reference['reference_url_info'] = 'https://tranco-list.eu/methodology' 21 | 22 | def __set_data_url(self): 23 | """Set the data URL using the permanent ID of the current list, which stays 24 | valid once the permalink is updated.""" 25 | try: 26 | res = requests.get('https://tranco-list.eu/top-1m-id') 27 | res.raise_for_status() 28 | self.reference['reference_url_data'] = f'https://tranco-list.eu/download_daily/{res.text}' 29 | except requests.HTTPError as e: 30 | logging.warning(f'Failed to update data URL: {e}') 31 | 32 | def run(self): 33 | """Fetch Tranco top 1M and push to IYP.""" 34 | 35 | self.tranco_qid = self.iyp.get_node('Ranking', {'name': 'Tranco top 1M'}) 36 | 37 | logging.info('Downloading latest list...') 38 | req = requests.get(URL) 39 | req.raise_for_status() 40 | 41 | set_modification_time_from_last_modified_header(self.reference, req) 42 | self.__set_data_url() 43 | 44 | links = [] 45 | domains = set() 46 | # open zip file and read top list 47 | with ZipFile(io.BytesIO(req.content)) as z: 48 | with z.open('top-1m.csv') as list: 49 | for i, row in enumerate(io.TextIOWrapper(list)): 50 | row = row.rstrip() 51 | rank, domain = row.split(',') 52 | 53 | domains.add(domain) 54 | links.append({'src_name': domain, 'dst_id': self.tranco_qid, 55 | 'props': [self.reference, {'rank': int(rank)}]}) 56 | 57 | name_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', domains) 58 | 59 | for link in links: 60 | link['src_id'] = name_id[link['src_name']] 61 | 62 | # Push all links to IYP 63 | self.iyp.batch_add_links('RANK', links) 64 | 65 | def unit_test(self): 66 | return super().unit_test(['RANK']) 67 | 68 | 69 | def main() -> None: 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('--unit-test', action='store_true') 72 | args = parser.parse_args() 73 | 74 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 75 | logging.basicConfig( 76 | format=FORMAT, 77 | filename='log/' + NAME + '.log', 78 | level=logging.INFO, 79 | datefmt='%Y-%m-%d %H:%M:%S' 80 | ) 81 | 82 | logging.info(f'Started: {sys.argv}') 83 | 84 | crawler = Crawler(ORG, URL, NAME) 85 | if args.unit_test: 86 | crawler.unit_test() 87 | else: 88 | crawler.run() 89 | crawler.close() 90 | logging.info(f'Finished: {sys.argv}') 91 | 92 | 93 | if __name__ == '__main__': 94 | main() 95 | sys.exit(0) 96 | -------------------------------------------------------------------------------- /iyp/crawlers/virginiatech/README.md: -------------------------------------------------------------------------------- 1 | # RoVista -- https://rovista.netsecurelab.org/ 2 | 3 | > RoVista aims to determine the Routing Origin Validation (ROV) status of network 4 | > operators. 5 | > 6 | > RoV Scores are determined based on the number of RPKI-invalid prefixes reachable by an 7 | > Autonomous System (AS). 
Consequently, a higher ROV score suggests that the AS can 8 | > effectively filter more RPKI-invalid prefixes. However, it is important to note that 9 | > the RoV score does not conclusively indicate whether an AS has actually implemented 10 | > ROV or not, partly due to limitations in [the] framework and other contributing 11 | > factors. 12 | 13 | IYP converts these scores (or ratios) to two Tags: 14 | 15 | - ASes with a ratio greater than 0.5 are categorized as `Validating RPKI ROV` 16 | - ASes with a ratio of less or equal 0.5 are categorized as `Not Validating RPKI ROV` 17 | 18 | ## Graph representation 19 | 20 | ```cypher 21 | (:AS {asn: 2497})-[:CATEGORIZED {ratio: 1.0}]->(:Tag {label: 'Validating RPKI ROV'}) 22 | (:AS {asn: 6762})-[:CATEGORIZED {ratio: 0}]->(:Tag {label: 'Not Validating RPKI ROV'}) 23 | ``` 24 | 25 | ## Dependence 26 | 27 | This crawler is not depending on other crawlers. 28 | -------------------------------------------------------------------------------- /iyp/crawlers/virginiatech/rovista.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from datetime import datetime, timezone 5 | 6 | import requests 7 | 8 | from iyp import BaseCrawler 9 | 10 | URL = 'https://api.rovista.netsecurelab.org/rovista/api/overview' 11 | ORG = 'Virginia Tech' 12 | NAME = 'virginiatech.rovista' 13 | 14 | 15 | class Crawler(BaseCrawler): 16 | def __init__(self, organization, url, name): 17 | super().__init__(organization, url, name) 18 | self.reference['reference_url_info'] = 'https://rovista.netsecurelab.org/' 19 | 20 | def __set_modification_time(self, entry): 21 | try: 22 | date_str = entry['lastUpdatedDate'] 23 | date = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc) 24 | self.reference['reference_time_modification'] = date 25 | except (KeyError, ValueError) as e: 26 | logging.warning(f'Failed to set modification time: {e}') 27 | 28 | def run(self): 29 | """Get RoVista data from their API.""" 30 | batch_size = 1000 31 | offset = 0 32 | entries = [] 33 | asns = set() 34 | 35 | while True: 36 | # Make a request with the current offset 37 | response = requests.get(URL, params={'offset': offset, 'count': batch_size}) 38 | response.raise_for_status() 39 | 40 | data = response.json().get('data', []) 41 | for entry in data: 42 | if not self.reference['reference_time_modification']: 43 | self.__set_modification_time(entry) 44 | asns.add(entry['asn']) 45 | if entry['ratio'] > 0.5: 46 | entries.append({'asn': entry['asn'], 'ratio': entry['ratio']}) 47 | else: 48 | entries.append({'asn': entry['asn'], 'ratio': entry['ratio']}) 49 | 50 | # Move to the next page 51 | offset += 1 52 | # Break the loop if there's no more data 53 | if len(data) < batch_size: 54 | break 55 | 56 | # get ASNs and prefixes IDs 57 | self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) 58 | tag_id_not_valid = self.iyp.get_node('Tag', {'label': 'Not Validating RPKI ROV'}) 59 | tag_id_valid = self.iyp.get_node('Tag', {'label': 'Validating RPKI ROV'}) 60 | # Compute links 61 | links = [] 62 | for entry in entries: 63 | asn_qid = self.asn_id[entry['asn']] 64 | if entry['ratio'] > 0.5: 65 | links.append({'src_id': asn_qid, 'dst_id': tag_id_valid, 66 | 'props': [self.reference, {'ratio': entry['ratio']}]}) 67 | else: 68 | links.append({'src_id': asn_qid, 'dst_id': tag_id_not_valid, 69 | 'props': [self.reference, {'ratio': entry['ratio']}]}) 70 | 71 | # Push all links to IYP 72 | 
self.iyp.batch_add_links('CATEGORIZED', links) 73 | 74 | def unit_test(self): 75 | return super().unit_test(['CATEGORIZED']) 76 | 77 | 78 | def main() -> None: 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('--unit-test', action='store_true') 81 | args = parser.parse_args() 82 | 83 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 84 | logging.basicConfig( 85 | format=FORMAT, 86 | filename='log/' + NAME + '.log', 87 | level=logging.INFO, 88 | datefmt='%Y-%m-%d %H:%M:%S' 89 | ) 90 | 91 | logging.info(f'Started: {sys.argv}') 92 | 93 | crawler = Crawler(ORG, URL, NAME) 94 | if args.unit_test: 95 | crawler.unit_test() 96 | else: 97 | crawler.run() 98 | crawler.close() 99 | logging.info(f'Finished: {sys.argv}') 100 | 101 | 102 | if __name__ == '__main__': 103 | main() 104 | sys.exit(0) 105 | -------------------------------------------------------------------------------- /iyp/crawlers/worldbank/README.md: -------------------------------------------------------------------------------- 1 | # World Bank's country population -- https://www.worldbank.org/en/home 2 | 3 | > The World Bank Group works in every major area of development. We provide a wide array of financial products and technical assistance, and we help countries share and apply innovative knowledge and solutions to the challenges they face. 4 | 5 | > The World Bank is like a cooperative, made up of 189 member countries. These member countries, or shareholders, are represented by a Board of Governors, who are the ultimate policymakers at the World Bank. Generally, the governors are member countries' ministers of finance or ministers of development. They meet once a year at the Annual Meetings of the Boards of Governors of the World Bank Group and the International Monetary Fund. 6 | 7 | ## Graph representation 8 | 9 | ### Country Estimate 10 | Connect `Country` to an `Estimate` node meaning that a country has an estimated population of `value`. 11 | ``` 12 | (:Country)-[:POPULATION {value: 123}]->(:Estimate {name: 'World Bank Population Estimate'}) 13 | ``` 14 | 15 | ## Dependence 16 | This crawler depends on crawlers creating Country nodes. 
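The sketch below shows how this estimate can be read back, for example to weight per-country results by population; it relies only on the node labels and the `value` property documented above.

```Cypher
// Sketch: population estimate stored for Japan.
MATCH (c:Country {country_code: 'JP'})-[p:POPULATION]->(:Estimate {name: 'World Bank Population Estimate'})
RETURN c.country_code, p.value
```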
-------------------------------------------------------------------------------- /iyp/crawlers/worldbank/country_pop.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import sys 5 | from datetime import datetime, timezone 6 | 7 | import requests 8 | 9 | from iyp import BaseCrawler 10 | 11 | URL = 'https://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?per_page=400&mrv=1&format=json' 12 | ORG = 'WorldBank' 13 | NAME = 'worldbank.country_pop' 14 | 15 | 16 | class Crawler(BaseCrawler): 17 | def __init__(self, organization, url, name): 18 | super().__init__(organization, url, name) 19 | self.reference['reference_url_info'] = ( 20 | 'https://datahelpdesk.worldbank.org/knowledgebase/articles/' 21 | '889392-about-the-indicators-api-documentation' 22 | ) 23 | 24 | def run(self): 25 | """Get country population from Worldbank API and push it to IYP.""" 26 | 27 | # Get content 28 | req = requests.get(URL) 29 | req.raise_for_status() 30 | content = json.loads(req.content) 31 | 32 | # Set last time of modification 33 | self.reference['reference_time_modification'] = datetime.strptime(content[0]['lastupdated'], 34 | '%Y-%m-%d').replace(tzinfo=timezone.utc) 35 | 36 | # Get countries present in IYP cc to id mapping 37 | country_ids = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', create=False, all=True) 38 | 39 | # Get countries and population from World Bank 40 | lines = set() 41 | for entry in content[1]: 42 | 43 | country = entry['country']['id'] 44 | if country not in country_ids or not entry['value']: 45 | continue 46 | 47 | population = entry['value'] 48 | lines.add((country, population)) 49 | 50 | # Get `Estimate` node ID 51 | estimate_qid = self.iyp.get_node('Estimate', properties={'name': 'World Bank Population Estimate'}) 52 | 53 | # Compute links 54 | links = [] 55 | for (country, population) in lines: 56 | 57 | country_qid = country_ids[country] 58 | 59 | links.append({'src_id': country_qid, 'dst_id': estimate_qid, 60 | 'props': [self.reference, {'value': population}]}) 61 | 62 | # Push all links to IYP 63 | self.iyp.batch_add_links('POPULATION', links) 64 | 65 | def unit_test(self): 66 | return super().unit_test(['POPULATION']) 67 | 68 | 69 | def main() -> None: 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('--unit-test', action='store_true') 72 | args = parser.parse_args() 73 | 74 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 75 | logging.basicConfig( 76 | format=FORMAT, 77 | filename='log/' + NAME + '.log', 78 | level=logging.INFO, 79 | datefmt='%Y-%m-%d %H:%M:%S' 80 | ) 81 | 82 | logging.info(f'Started: {sys.argv}') 83 | 84 | crawler = Crawler(ORG, URL, NAME) 85 | if args.unit_test: 86 | crawler.unit_test() 87 | else: 88 | crawler.run() 89 | crawler.close() 90 | logging.info(f'Finished: {sys.argv}') 91 | 92 | 93 | if __name__ == '__main__': 94 | main() 95 | sys.exit(0) 96 | -------------------------------------------------------------------------------- /iyp/post/address_family.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp import BasePostProcess 6 | 7 | NAME = 'post.address_family' 8 | 9 | 10 | class PostProcess(BasePostProcess): 11 | def run(self): 12 | """Add address family (4 or 6 for IPv4 or IPv6) to all IP and Prefix nodes.""" 13 | 14 | # Update prefixes 15 | self.iyp.tx.run("MATCH (pfx:Prefix) WHERE pfx.prefix CONTAINS '.' 
SET pfx.af = 4") 16 | self.iyp.commit() 17 | self.iyp.tx.run("MATCH (pfx:Prefix) WHERE pfx.prefix CONTAINS ':' SET pfx.af = 6") 18 | self.iyp.commit() 19 | 20 | # Update IP addresses 21 | self.iyp.tx.run("MATCH (ip:IP) WHERE ip.ip CONTAINS '.' SET ip.af = 4") 22 | self.iyp.commit() 23 | self.iyp.tx.run("MATCH (ip:IP) WHERE ip.ip CONTAINS ':' SET ip.af = 6") 24 | self.iyp.commit() 25 | 26 | def unit_test(self): 27 | raise NotImplementedError() 28 | 29 | 30 | def main() -> None: 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--unit-test', action='store_true') 33 | args = parser.parse_args() 34 | 35 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 36 | logging.basicConfig( 37 | format=FORMAT, 38 | filename='log/' + NAME + '.log', 39 | level=logging.INFO, 40 | datefmt='%Y-%m-%d %H:%M:%S' 41 | ) 42 | 43 | logging.info(f'Started: {sys.argv}') 44 | 45 | post = PostProcess(NAME) 46 | if args.unit_test: 47 | post.unit_test() 48 | else: 49 | post.run() 50 | post.close() 51 | logging.info(f'Finished: {sys.argv}') 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | sys.exit(0) 57 | -------------------------------------------------------------------------------- /iyp/post/clean_links.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | from iyp import BasePostProcess 6 | 7 | NAME = 'post.clean_links' 8 | 9 | 10 | class PostProcess(BasePostProcess): 11 | def get_links_of_type(self, link_type, prop_dict=None): 12 | """Returns a list of all links of a given type with optional properties, 13 | including the source and destination nodes. 14 | 15 | Parameters: 16 | - link_type: The type of links to return. 17 | - prop_dict: Optional dictionary of properties to return. 18 | 19 | Returns: 20 | - List of links with the specified type and properties, including source and 21 | destination nodes. 22 | """ 23 | prop_conditions = ( 24 | ' AND '.join([f"r.{k} = '{v}'" for k, v in prop_dict.items()]) 25 | if prop_dict 26 | else 'TRUE' 27 | ) 28 | prop_str = ', '.join([f'r.{k}' for k in prop_dict.keys()]) if prop_dict else '*' 29 | 30 | query = f""" 31 | MATCH (src)-[r:{link_type}]->(dst) 32 | WHERE {prop_conditions} 33 | RETURN elementId(r) AS link_id, elementId(src) AS src_id, elementId(dst) AS dst_id, {prop_str} 34 | """ 35 | result = self.iyp.tx.run(query) 36 | if result: 37 | return [record for record in result] 38 | else: 39 | return None 40 | 41 | def delete_links(self, link_ids): 42 | """Deletes all links in the given list. 43 | 44 | Parameters: 45 | - link_ids: List of link IDs to delete. 
46 | 47 | Returns: 48 | - None 49 | """ 50 | query = """ 51 | UNWIND $link_ids AS link_id 52 | MATCH ()-[r]->() 53 | WHERE elementId(r) = link_id 54 | DELETE r 55 | """ 56 | self.iyp.tx.run(query, link_ids=link_ids) 57 | 58 | def clean_links_of_type(self, link_type, prop_dict=None): 59 | links = self.get_links_of_type(link_type, prop_dict) 60 | link_dict = {} 61 | for link in links: 62 | key = (link['src_id'], link['dst_id']) 63 | if key not in link_dict: 64 | link_dict[key] = [] 65 | link_dict[key].append(link['link_id']) 66 | 67 | # Create the new list of link IDs excluding the first one for each (src_id, 68 | # dst_id) pair 69 | filtered_link_ids = [] 70 | for key, link_ids in link_dict.items(): 71 | if len(link_ids) > 1: 72 | filtered_link_ids.extend(link_ids[1:]) 73 | 74 | self.delete_links(filtered_link_ids) 75 | 76 | def run(self): 77 | # Clean links of all types with the reference_org 'OONI' 78 | link_types = ['COUNTRY', 'RESOLVES_TO', 'PART_OF', 'CATEGORIZED'] 79 | for link_type in link_types: 80 | self.clean_links_of_type(link_type, {'reference_org': 'OONI'}) 81 | 82 | def unit_test(self): 83 | raise NotImplementedError() 84 | 85 | 86 | def main() -> None: 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--unit-test', action='store_true') 89 | args = parser.parse_args() 90 | 91 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 92 | logging.basicConfig( 93 | format=FORMAT, 94 | filename='log/' + NAME + '.log', 95 | level=logging.INFO, 96 | datefmt='%Y-%m-%d %H:%M:%S', 97 | ) 98 | 99 | logging.info(f'Started: {sys.argv}') 100 | 101 | post = PostProcess(NAME) 102 | if args.unit_test: 103 | post.unit_test() 104 | else: 105 | post.run() 106 | post.close() 107 | logging.info(f'Finished: {sys.argv}') 108 | 109 | 110 | if __name__ == '__main__': 111 | main() 112 | sys.exit(0) 113 | -------------------------------------------------------------------------------- /iyp/post/country_information.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import iso3166 6 | 7 | from iyp import BasePostProcess 8 | 9 | NAME = 'post.country_information' 10 | 11 | 12 | class PostProcess(BasePostProcess): 13 | def run(self): 14 | """Enrich Country nodes with additional information like alpha-3 codes and 15 | country names.""" 16 | 17 | country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code') 18 | 19 | for country_code in country_id: 20 | if country_code not in iso3166.countries_by_alpha2: 21 | logging.error(f'Country code "{country_code}" is not ISO 3166-1 alpha-2 conform.') 22 | continue 23 | country_info = iso3166.countries_by_alpha2[country_code] 24 | new_props = {'name': country_info.apolitical_name, 25 | 'alpha3': country_info.alpha3} 26 | self.iyp.tx.run(""" 27 | MATCH (n:Country) 28 | WHERE elementId(n) = $id 29 | SET n += $props 30 | """, 31 | id=country_id[country_code], 32 | props=new_props) 33 | self.iyp.commit() 34 | 35 | def unit_test(self): 36 | raise NotImplementedError() 37 | 38 | 39 | def main() -> None: 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--unit-test', action='store_true') 42 | args = parser.parse_args() 43 | 44 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 45 | logging.basicConfig( 46 | format=FORMAT, 47 | filename='log/' + NAME + '.log', 48 | level=logging.INFO, 49 | datefmt='%Y-%m-%d %H:%M:%S' 50 | ) 51 | 52 | logging.info(f'Started: {sys.argv}') 53 | 54 | post = PostProcess(NAME) 55 | if args.unit_test: 56 | 
post.unit_test() 57 | else: 58 | post.run() 59 | post.close() 60 | logging.info(f'Finished: {sys.argv}') 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | sys.exit(0) 66 | -------------------------------------------------------------------------------- /iyp/post/url2hostname.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | 5 | import tldextract 6 | 7 | from iyp import BasePostProcess 8 | 9 | NAME = 'post.url2hostname' 10 | 11 | 12 | class PostProcess(BasePostProcess): 13 | def run(self): 14 | """Link URLs and their corresponding HostNames.""" 15 | 16 | # Get all URL nodes. 17 | url_id = self.iyp.batch_get_nodes_by_single_prop('URL', 'url') 18 | 19 | # Get all HostName Nodes 20 | hostname_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name') 21 | 22 | # Compute links 23 | links = [] 24 | for url, url_qid in url_id.items(): 25 | # Extract host name from URL 26 | hostname = tldextract.extract(url).fqdn 27 | 28 | # Get HostName node for the fqdn of the URL 29 | hostname_qid = hostname_id.get(hostname) 30 | 31 | if hostname_qid is not None: 32 | links.append({ 33 | 'src_id': url_qid, 34 | 'dst_id': hostname_qid, 35 | 'props': [self.reference] 36 | }) 37 | 38 | # push links to IYP 39 | self.iyp.batch_add_links('PART_OF', links) 40 | 41 | def unit_test(self): 42 | raise NotImplementedError() 43 | 44 | 45 | def main() -> None: 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--unit-test', action='store_true') 48 | args = parser.parse_args() 49 | 50 | FORMAT = '%(asctime)s %(levelname)s %(message)s' 51 | logging.basicConfig( 52 | format=FORMAT, 53 | filename='log/' + NAME + '.log', 54 | level=logging.INFO, 55 | datefmt='%Y-%m-%d %H:%M:%S' 56 | ) 57 | 58 | logging.info(f'Started: {sys.argv}') 59 | 60 | post = PostProcess(NAME) 61 | if args.unit_test: 62 | post.unit_test() 63 | else: 64 | post.run() 65 | post.close() 66 | logging.info(f'Finished: {sys.argv}') 67 | 68 | 69 | if __name__ == '__main__': 70 | main() 71 | sys.exit(0) 72 | -------------------------------------------------------------------------------- /log/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InternetHealthReport/internet-yellow-pages/c79ced8783c73f282d273a886f4aa5cf8018fdd9/log/.gitkeep -------------------------------------------------------------------------------- /public/README.md: -------------------------------------------------------------------------------- 1 | # IYP public instance 2 | IYP is served behind a NGINX reverse proxy. 3 | The [landing page](https://github.com/InternetHealthReport/iyp-website/) is at the root and the `iyp/` path is pointing to the neo4j instance (`public_notls`). 4 | 5 | Example nginx configuration: 6 | ``` 7 | location ^~ /iyp/ { 8 | proxy_pass http://10.255.255.11:7474/; 9 | } 10 | ``` 11 | 12 | The BOLT endpoint (iyp-bolt.iijlab.net) points directly to Neo4j's 7687 port. 13 | 14 | # Note 15 | The `public_tls` configuration is not currently used. This is required if the machine serving IYP is publicly accessible and directly accessed by clients. 
16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | neo4j 2 | iso3166 3 | progressbar2 4 | arrow 5 | tldextract 6 | py-radix 7 | requests 8 | urllib3 9 | chardet 10 | bs4 11 | requests-cache 12 | lz4 13 | frozendict 14 | docker 15 | boto3 16 | botocore 17 | pandas 18 | fastparquet 19 | flatdict 20 | requests_futures 21 | autopep8 22 | flake8 23 | pre-commit 24 | PyGithub 25 | clickhouse_driver 26 | paramiko 27 | scp 28 | -------------------------------------------------------------------------------- /run_all_crawlers.sh: -------------------------------------------------------------------------------- 1 | echo "Starting all crawlers" 2 | date 3 | 4 | # MANRS 5 | python3 -m iyp.crawlers.manrs.members 6 | 7 | # AS Names 8 | python3 -m iyp.crawlers.ripe.as_names 9 | python3 -m iyp.crawlers.bgptools.as_names 10 | python3 -m iyp.crawlers.emileaben.as_names 11 | 12 | # AS Peers 13 | python3 -m iyp.crawlers.inetintel.siblings_asdb 14 | 15 | # Rankings 16 | python3 -m iyp.crawlers.apnic.eyeball 17 | python3 -m iyp.crawlers.caida.asrank 18 | python3 -m iyp.crawlers.ihr.country_dependency 19 | 20 | # BGP data 21 | python3 -m iyp.crawlers.bgpkit.pfx2asn 22 | python3 -m iyp.crawlers.bgpkit.as2rel 23 | python3 -m iyp.crawlers.bgpkit.peerstats 24 | python3 -m iyp.crawlers.ripe.roa 25 | 26 | # IHR 27 | python3 -m iyp.crawlers.ihr.local_hegemony 28 | python3 -m iyp.crawlers.ihr.rov 29 | 30 | # DNS 31 | python3 -m iyp.crawlers.tranco.top1M 32 | python3 -m iyp.crawlers.cloudflare.top100 33 | 34 | #BGP.Tools tags, and anycast prefixes 35 | python3 -m iyp.crawlers.bgptools.tags 36 | python3 -m iyp.crawlers.bgptools.anycast_prefixes 37 | 38 | #PeeringDB 39 | python3 -m iyp.crawlers.peeringdb.org 40 | python3 -m iyp.crawlers.peeringdb.ix 41 | 42 | # Delegated files 43 | python3 -m iyp.crawlers.nro.delegated_stats 44 | 45 | # URL data 46 | python3 -m iyp.crawlers.citizenlab.urldb 47 | 48 | # OONI 49 | python3 -m iyp.crawlers.ooni.webconnectivity 50 | python3 -m iyp.crawlers.ooni.facebookmessenger 51 | python3 -m iyp.crawlers.ooni.signal 52 | python3 -m iyp.crawlers.ooni.telegram 53 | python3 -m iyp.crawlers.ooni.whatsapp 54 | python3 -m iyp.crawlers.ooni.httpheaderfieldmanipulation 55 | python3 -m iyp.crawlers.ooni.httpinvalidrequestline 56 | python3 -m iyp.crawlers.ooni.psiphon 57 | python3 -m iyp.crawlers.ooni.riseupvpn 58 | python3 -m iyp.crawlers.ooni.stunreachability 59 | python3 -m iyp.crawlers.ooni.tor 60 | python3 -m iyp.crawlers.ooni.torsf 61 | python3 -m iyp.crawlers.ooni.vanillator 62 | 63 | echo "All crawlers finished" 64 | date 65 | -------------------------------------------------------------------------------- /send_email.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import smtplib 4 | from email.message import EmailMessage 5 | from email.utils import formataddr 6 | 7 | # Load configuration file 8 | with open('config.json', 'r') as fp: 9 | conf = json.load(fp) 10 | 11 | 12 | def send_email(e): 13 | email_config = conf.get('email') 14 | if email_config is not None \ 15 | and email_config['email_address'] != '' \ 16 | and email_config['smtp_server'] != '' \ 17 | and email_config['smtp_port'] != '' \ 18 | and email_config['username'] != '' \ 19 | and email_config['password'] != '': 20 | # Send an email 21 | sender = email_config['email_address'] 22 | recipient = sender 23 | smtp_server = 
email_config['smtp_server'] 24 | smtp_port = email_config['smtp_port'] 25 | username = email_config['username'] 26 | password = email_config['password'] 27 | 28 | msg = EmailMessage() 29 | msg['From'] = formataddr(('Internet Yellow Pages.', f'{sender}')) 30 | msg['To'] = recipient 31 | msg['Subject'] = 'Error when running create_db script' 32 | msg.set_content( 33 | f"""\ 34 | An exception was caught during run of create_db script:\n\n{str(e)} 35 | """ 36 | ) 37 | msg.add_alternative( 38 | f"""\ 39 | <html> 40 | <body> 41 | <p>An exception was caught during run of create_db script:</p> 42 | <pre>{str(e)}</pre> 43 | </body> 44 | </html> 45 | """, 46 | subtype='html', 47 | ) 48 | 49 | with smtplib.SMTP(smtp_server, smtp_port) as server: 50 | server.starttls() 51 | server.login(username, password) 52 | server.sendmail(sender, recipient, msg.as_string()) 53 | logging.info('Email sent') 54 | server.quit() 55 | else: 56 | print('Email credentials not found in config file') 57 | 58 | 59 | if __name__ == '__main__': 60 | send_email('Error when running Crawler 6') 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | version='0.0.1', 5 | name='iyp', 6 | packages=find_packages(), 7 | ) 8 | -------------------------------------------------------------------------------- /unit_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | 5 | def run_crawler(crawler): 6 | print('Running Crawler: ' + crawler) 7 | subprocess.call(['python3', '-m', crawler, '--unit-test']) 8 | 9 | 10 | def run_post_script(post_script): 11 | print('Running Post Script: ' + post_script) 12 | subprocess.call(['python3', '-m', post_script, '--unit-test']) 13 | 14 | 15 | with open('config.json') as config_str: 16 | 17 | config = json.load(config_str) 18 | 19 | # read crawlers info and start unit testing of the crawlers 20 | crawlers = config['iyp']['crawlers'] 21 | for crawler in crawlers: 22 | run_crawler(crawler) 23 | 24 | # read post scripts info and start unit testing of the post scripts 25 | post_scripts = config['post'] 26 | for post_script in post_scripts: 27 | run_post_script(post_script) 28 | --------------------------------------------------------------------------------