├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── Pipfile
├── README.md
├── development.md
├── netanalysis
│   ├── __init__.py
│   ├── blocktest
│   │   ├── README.md
│   │   └── measure.sh
│   ├── dns
│   │   ├── __init__.py
│   │   ├── analysis
│   │   │   ├── DomainAnalysis.ipynb
│   │   │   ├── __init__.py
│   │   │   ├── analysis_app.py
│   │   │   ├── analyze_dns.md
│   │   │   ├── analyze_domain.py
│   │   │   ├── classifier.py
│   │   │   ├── graph.py
│   │   │   └── ip_info_widget.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── serialization.py
│   │   └── google
│   │       ├── __init__.py
│   │       ├── get_google_dns_locations.sh
│   │       ├── google_dns.py
│   │       ├── google_dns_locations.txt
│   │       ├── is_google_dns.py
│   │       └── test_google_dns.py
│   ├── infrastructure
│   │   ├── __init__.py
│   │   └── resources.py
│   ├── ip
│   │   ├── __init__.py
│   │   ├── ip_info.py
│   │   ├── ip_info_test.py
│   │   ├── model.py
│   │   ├── simple_autonomous_system.py
│   │   └── test_simple_autonomous_system.py
│   ├── ooni
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── __init__.py
│   │   │   ├── ooni_client.py
│   │   │   └── sync_measurements.py
│   │   └── measurements_to_dns_records.py
│   ├── third_party
│   ├── tls
│   │   ├── __init__.py
│   │   └── domain_ip_validator.py
│   └── traffic
│       ├── README.md
│       ├── __init__.py
│       ├── analysis
│       │   ├── TrafficCorrelations.html
│       │   ├── TrafficCorrelations.ipynb
│       │   ├── __init__.py
│       │   ├── find_anomalies.py
│       │   └── model.py
│       └── data
│           ├── __init__.py
│           ├── api_repository.py
│           ├── fetch_google_traffic.py
│           ├── file_repository.py
│           └── model.py
├── pipenv.sh
├── setup.py
├── third_party
│   ├── caida.org
│   │   ├── README.md
│   │   ├── as-classification
│   │   │   ├── METADATA
│   │   │   └── as2types.txt.gz
│   │   ├── as-organizations
│   │   │   ├── METADATA
│   │   │   ├── README.txt
│   │   │   └── as-org2info.txt.gz
│   │   └── caida_pub_aua.pdf
│   └── db-ip
│       ├── dbip-asn-lite
│       │   ├── COPYRIGHT.txt
│       │   ├── LICENSE.txt
│       │   ├── METADATA
│       │   └── dbip-asn-lite.mmdb
│       └── dbip-country-lite
│           ├── COPYRIGHT.txt
│           ├── LICENSE.txt
│           ├── METADATA
│           └── dbip-country-lite.mmdb
└── tools
    ├── ipython.py
    └── jupyter.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__
3 | .ipynb_checkpoints
4 | .venv
5 | .mypy_cache
6 | 
7 | # Pipfile.lock is generated from pipenv install
8 | Pipfile.lock
9 | 
10 | # Bazel output
11 | /bazel-*
12 | 
13 | /ooni_data
14 | /traffic_data
15 | .DS_Store
16 | 
17 | # IDEs
18 | .idea/
19 | .vscode/
20 | 
21 | # Build output
22 | build/
23 | dist/
24 | jigsaw_net_analysis.egg-info/
25 | __init__.pyc
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 | 
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 | 
6 | ## Contributor License Agreement
7 | 
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 | 
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 | 
18 | ## Code reviews
19 | 
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 | 
25 | ## Wishlist
26 | 
27 | Here are some things we'd like to see implemented:
28 | 
29 | * Scalable and reproducible OONI data fetch
30 |   * Fetch data from their Amazon S3 bucket ([info](https://ooni.torproject.org/post/mining-ooni-data/))
31 |   * Store OONI data in a compact and queryable format. Consider [Parquet](https://pypi.python.org/pypi/parquet) or [Avro](https://github.com/tebeka/fastavro).
32 |   * Use Apache Spark to pre-process data on the cloud to significantly reduce and speed up data transfer.
33 | 
34 | * Enhance analysis
35 |   * Provide site-level analysis with timeline and high-level explanations.
36 |   * Provide country-level analysis, listing measured sites, and explaining types of interference found.
37 |   * Collect a list of blackhole IPs that each country uses.
38 | 
39 | * Make analysis accessible to non-technical people
40 |   * Create a web application with analysis that non-technical people can understand. Can piggyback on the analysis from the AnalyzeDomain notebook.
41 | 
42 | * Scale Analysis
43 |   * Figure out a way to classify every edge in the DNS graph in batch, rather than on-demand.
44 |   * Create or consume an IP -> TLS certificate data source to minimize connections for the TLS validation.
45 | 
46 | * Add new DNS data sources
47 |   * Include [DNS queries from Satellite](https://scans.io/study/washington-dns). Explore other sources.
48 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Jigsaw Operations LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Include everything under third_party as a data dependency
16 | graft netanalysis/third_party
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Jigsaw Operations LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | [[source]]
16 | name = "pypi"
17 | url = "https://pypi.org/simple"
18 | verify_ssl = true
19 | 
20 | [dev-packages]
21 | pylint = "*"
22 | autopep8 = "*"
23 | mypy = "*"
24 | rope = "*"
25 | 
26 | [packages]
27 | jigsaw-net-analysis = {editable = true, path = "."}
28 | 
29 | [requires]
30 | python_version = "3.6.9"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tools for Network Analysis
2 | 
3 | This repository contains tools, libraries, and applications to analyze network measurements.
4 | 
5 | _Disclaimer: This is not an officially supported Jigsaw product._
6 | 
7 | 
8 | > **Experimental code.** We may break you. Please let us know if you are using this code.
9 | 
10 | ## Installation
11 | 
12 | netanalysis is not on PyPI. Instead, you can install it straight from our GitHub repository:
13 | 
14 | `pip install git+git://github.com/Jigsaw-Code/net-analysis.git@master`
15 | 
16 | In a `Pipfile`, you can do:
17 | 
18 | ```
19 | [packages]
20 | netanalysis = {git = "git://github.com/Jigsaw-Code/net-analysis.git", ref = "master"}
21 | ```
22 | 
23 | You should specify a version tag or a commit if the `master` branch breaks you (see the pinning example below).
24 | 
25 | 
26 | ## Tutorials
27 | 
28 | ### Analyze DNS Measurements
29 | 
30 | [Analyze DNS Queries](netanalysis/dns/analysis/analyze_dns.md) describes how to fetch data from OONI and check for interference.
31 | 
32 | [netanalysis/dns/analysis/DomainAnalysis.ipynb](netanalysis/dns/analysis/DomainAnalysis.ipynb) gives you an idea of what the analysis looks like.
33 | 
34 | ### Fetch Google Traffic Data for analysis:
35 | 
36 | This uses the Google Transparency Report internal API. You must agree to [Google APIs Terms of Service](https://developers.google.com/terms/).
37 | 
38 | ```
39 | python -m netanalysis.traffic.data.fetch_google_traffic --output_dir=traffic_data/
40 | ```
41 | 
42 | See [netanalysis/traffic/README.md](netanalysis/traffic/README.md) for details.
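
Referring back to the Installation section above: to pin this dependency to a known-good revision, point at a tag or commit instead of `master`. The `v0.2.0` tag below is hypothetical — substitute a real tag or commit hash — and the `https` URL form is shown because GitHub no longer serves the unauthenticated `git://` protocol:

```
pip install git+https://github.com/Jigsaw-Code/net-analysis.git@v0.2.0
```

Or, in a `Pipfile`:

```
[packages]
netanalysis = {git = "https://github.com/Jigsaw-Code/net-analysis.git", ref = "v0.2.0"}
```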
43 | 
44 | 
45 | ### Get information about an IP address:
46 | 
47 | ```
48 | python -m netanalysis.ip.ip_info 8.8.8.8
49 | ```
50 | 
51 | ### Check if an IP is valid for a domain:
52 | 
53 | ```
54 | python -m netanalysis.tls.domain_ip_validator jigsaw.google.com 172.217.10.78
55 | ```
56 | 
57 | ## Development
58 | 
59 | See [Development Setup](development.md).
60 | 
61 | We welcome your contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for details and wishlist.
--------------------------------------------------------------------------------
/development.md:
--------------------------------------------------------------------------------
1 | # Python environment setup
2 | 
3 | ## Install Python 3 and pipenv
4 | 
5 | Install Python 3 if you don't have it already: https://www.python.org/downloads/
6 | 
7 | Install [Pipenv](https://docs.pipenv.org/) to manage Python environments and pip dependencies.
8 | 
9 | ```sh
10 | python3 -m pip install pipenv
11 | ```
12 | 
13 | (You may need to source `.bash_profile`.)
14 | 
15 | ## Set up Pipenv Virtual Environment
16 | 
17 | Development should be done inside a Pipenv environment. Create and enter your Pipenv environment with:
18 | 
19 | ```sh
20 | PIPENV_VENV_IN_PROJECT=1 python3 -m pipenv install --dev
21 | PIPENV_VENV_IN_PROJECT=1 python3 -m pipenv shell
22 | ```
23 | 
24 | `PIPENV_VENV_IN_PROJECT` will place the virtual environment in `./.venv/`.
25 | 
26 | On macOS you may get an error if you don't have the needed developer tools. You can use `xcode-select --install` to fix that.
27 | 
28 | 
29 | ### Add External Dependencies
30 | 
31 | Library dependencies must be listed in [setup.py](setup.py).
32 | 
33 | Dependencies for the development environment, such as development tools, should be listed in the [Pipfile](Pipfile).
34 | 
35 | After adding new dependencies, run `pipenv install` to refresh the environment.
36 | 
37 | ## Test
38 | 
39 | Run the linter:
40 | ```sh
41 | pylint netanalysis
42 | ```
43 | 
44 | Run the type checker:
45 | ```sh
46 | mypy netanalysis
47 | ```
48 | 
49 | Run the unit tests:
50 | ```sh
51 | python -m unittest -v
52 | ```
--------------------------------------------------------------------------------
/netanalysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/__init__.py
--------------------------------------------------------------------------------
/netanalysis/blocktest/README.md:
--------------------------------------------------------------------------------
1 | # Website Blocking Test
2 | 
3 | To test if and how a domain `$DOMAIN` is blocked, run:
4 | ```
5 | ./measure.sh $DOMAIN
6 | ```
7 | 
8 | Here is an example output for SNI-based blocking of Signal in Uzbekistan:
9 | ```
10 | $ bash measure.sh www.signal.org
11 | time: Sat Jul 3 18:05:15 UTC 2021
12 | domain: www.signal.org
13 | client_country: UZ
14 | client_as: AS8193 Uzbektelekom Joint Stock Company
15 | 
16 | DNS_INJECTION
17 |   root_nameserver: l.root-servers.net.
18 | query_response: status=NOERROR, num_answers=0, num_authorities=6 19 | analysis: OK - Received expected response 20 | SYSTEM_RESOLVER 21 | resolver_country: UZ 22 | resolver_ip: 185.74.5.210 23 | resolver_as: AS202660 Uzbektelekom Joint Stock Company 24 | response_ips: 13.32.123.28 13.32.123.33 13.32.123.43 13.32.123.56 25 | ips_from_doh: 13.32.123.28 13.32.123.33 13.32.123.43 13.32.123.56 26 | analysis: OK - Response IPs [13.32.123.28 13.32.123.33 13.32.123.43 13.32.123.56] were found on dns.google.com using DNS-over-HTTPS 27 | HTTP 28 | analysis: OK - Got expected response 29 | SNI 30 | analysis: INCONCLUSIVE - Failed to get TLS ServerHello (curl: (28) Operation timed out after 5001 milliseconds with 0 out of 0 bytes received) 31 | ``` 32 | 33 | 34 | For more examples of this script being used in practice, see the [discussion on blocking in Uzbekistan in July 2021](https://ntc.party/t/twitter-tik-tok-skype/1122). 35 | -------------------------------------------------------------------------------- /netanalysis/blocktest/measure.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Jigsaw Operations LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | set -u 16 | 17 | # Curl Errors (from `man libcurl-errors`) 18 | declare -ir CURLE_OK=0 19 | declare -ir CURLE_COULDNT_CONNECT=7 # We get this for connection refused. 20 | declare -ir CURLE_OPERATION_TIMEDOUT=28 21 | declare -ir CURLE_SSL_CONNECT_ERROR=35 22 | declare -ir CURLE_PEER_FAILED_VERIFICATION=51 23 | declare -ir CURLE_GOT_NOTHING=52 # Injected FIN triggers this. 24 | declare -ir CURLE_RECV_ERROR=56 # We get this for connection reset by peer. 25 | declare -ir CURLE_SSL_CACERT=60 # Could be MITM. 26 | 27 | function test_connectivity() { 28 | # Test signal 29 | local response 30 | # The gstatic.com url will return status 204 and no body. 31 | # It's HTTP so captive portals can intercept with a login page. 32 | response=$(curl --silent --dump-header - http://connectivitycheck.gstatic.com/generate_204 2> /dev/null) 33 | if (($? != 0)); then 34 | echo "You are OFFLINE (Failed to fetch http://connectivitycheck.gstatic.com/generate_204)" 35 | return 1 36 | fi 37 | # Test captive portal 38 | local status=$(echo $response | head -1 | cut -d' ' -f 2) 39 | if ((status != "204")); then 40 | echo "You are OFFLINE (Captive portal detected)" 41 | return 2 42 | fi 43 | return 0 44 | } 45 | 46 | function print_client_info() { 47 | declare -r client_info="$(curl --silent https://ipinfo.io | sed 's/ *"\(.*\)": "\(.*\)",$/\1: \2/')" 48 | echo client_country: $(echo "$client_info" | grep '^country:' | cut -d' ' -f 2-) 49 | echo client_as: $(echo "$client_info" | grep '^org:' | cut -d' ' -f 2-) 50 | } 51 | 52 | # Test for DNS injection. 53 | # It queries a root nameserver for the domain and expects a response with 54 | # NOERROR, no answers and the list of nameservers for the domain's TLD. 
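# For illustration, an expected (non-interfered) answer from a root nameserver
# for a .org domain looks roughly like this (the id and counts are placeholders):
#
#   ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 12345
#   ;; flags: qr rd; QUERY: 1, ANSWER: 0, AUTHORITY: 6, ADDITIONAL: 12
#   ;; AUTHORITY SECTION:
#   org.   172800   IN   NS   a0.org.afilias-nst.info.
#   ...
#
# The function below greps the "status:", "ANSWER:", and "AUTHORITY:" fields
# out of exactly this kind of response.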
55 | # This method is superior to sending the query to a blackhole because
56 | # it can provide positive confirmation that the query was not discarded.
57 | # It relies on the high capacity and availability of the root nameservers
58 | # and the fact that they are not blockable due to substantial collateral damage.
59 | # TODO: Test TCP and upper case.
60 | function test_dns_injection() {
61 |   echo "DNS_INJECTION"
62 |   declare -r domain=$1
63 |   declare -r root_nameserver=$(dig +short . ns | head -1)
64 |   if [[ -z "$root_nameserver" ]]; then
65 |     echo "  root_nameserver_error: Could not get root nameserver"
66 |     echo "  analysis: INCONCLUSIVE - Could not run test"
67 |     return 2
68 |   fi
69 |   echo "  root_nameserver: $root_nameserver"
70 |   declare response
71 |   if ! response=$(dig +time=2 @$root_nameserver $domain); then
72 |     echo "  query_error: Could not get response"
73 |     echo "  analysis: INTERFERENCE - Could not get response"
74 |     return 1
75 |   fi
76 |   declare -r status=$(echo $response | grep -oE 'status: \w+' | cut -d ' ' -f 2)
77 |   declare -ri num_answers=$(echo $response | grep -oE 'ANSWER: \w+' | cut -d ' ' -f 2)
78 |   declare -ri num_authorities=$(echo $response | grep -oE 'AUTHORITY: \w+' | cut -d ' ' -f 2)
79 |   echo "  query_response: status=$status, num_answers=$num_answers, num_authorities=$num_authorities"
80 |   if [[ $status == 'NOERROR' && $num_answers == 0 && $num_authorities -ge 1 ]]; then
81 |     echo "  analysis: OK - Received expected response"
82 |     return 0
83 |   fi
84 |   echo "  analysis: INTERFERENCE - Received unexpected response: $response"
85 |   return 1
86 | }
87 | 
88 | # Tests DNS interference. First tries to detect injection. If no injection,
89 | # also tests the system resolver and verifies whether the returned IPs are valid for
90 | # the test domain.
91 | function test_dns_blocking() {
92 |   local domain=$1
93 |   test_dns_injection $domain
94 |   if [[ $? == 1 ]]; then
95 |     # We don't test the system resolver because we know responses are injected.
96 |     # TODO: Consider running the system resolver test anyway, since some ISPs redirect
97 |     # all DNS traffic to their local resolver, even if they do not block.
98 |     return
99 |   fi
100 | 
101 |   echo "SYSTEM_RESOLVER"
102 |   declare -r resolver_ip="$(dig +short TXT whoami.ds.akahelp.net | grep ns | cut -d\" -f 4)"
103 |   declare -r resolver_info="$(curl --silent https://ipinfo.io/${resolver_ip} | sed 's/ *"\(.*\)": "\(.*\)",$/\1: \2/')"
104 |   echo "  resolver_country: $(echo "$resolver_info" | grep '^country:' | cut -d' ' -f 2-)"
105 |   echo "  resolver_ip: $(echo "$resolver_info" | grep '^ip:' | cut -d' ' -f 2-)"
106 |   echo "  resolver_as: $(echo "$resolver_info" | grep '^org:' | cut -d' ' -f 2-)"
107 | 
108 |   declare -r ips=$(dig +dnssec +short $domain | grep -o -E '([0-9]+\.){3}[0-9]+' | sort)
109 |   echo "  response_ips: "$ips
110 |   if [[ $ips == "" ]]; then
111 |     echo "  analysis: INTERFERENCE - Did not get any IPs from the resolver"
112 |     return 1
113 |   fi
114 | 
115 |   # Test if IPs are valid for a given domain.
116 |   # We first check if it's a globally addressable IP (not localhost, local network etc.)
117 |   # Then we query Google DoH to get the IPs and use that as ground truth. If there's
118 |   # overlap, we conclude the IPs are valid.
119 |   # That may fail for load or geo balanced servers. In that case they will likely support
120 |   # HTTPS. If the IP can successfully establish a TLS connection for the domain, that's proof
121 |   # the IP is valid for the domain.
122 |   # The (ip, domain) validation is vulnerable to censorship (IP and SNI-based blocking), but
123 |   # it does not have to happen at the test network. We could provide a backend for that instead.
124 |   local ip=$(echo "$ips" | head -1)
125 |   local ip_info=$(curl --silent "https://ipinfo.io/$ip")
126 |   if echo "$ip_info" | grep "bogon" > /dev/null; then
127 |     echo "  analysis: INTERFERENCE - Response IP $ip is a bogon"
128 |     return 1
129 |   fi
130 | 
131 |   # Validate IPs by comparing to a trusted resolution.
132 |   # Hardcode IP address to bypass potential DNS blocking.
133 |   # dns.google.com may still be blocked by IP or SNI. We use an upper-case domain to bypass some SNI blocking.
134 |   # TODO: Check if dns.google.com is IP or SNI blocked.
135 |   # TODO: Use ClientHello split to avoid more SNI blocking.
136 |   # TODO: Run a DoH server for measurements on a shared IP address.
137 |   # TODO: Recurse in case of blocking. Needs to follow CNAMEs. That would still be vulnerable to blocked authoritatives.
138 |   local ips_from_doh=$(curl --silent --connect-to ::8.8.8.8: "https://DNS.GOOGLE/resolve?name=$domain" | grep -o -E '([0-9]+\.){3}[0-9]+' | sort)
139 |   echo "  ips_from_doh: " $ips_from_doh
140 |   local common_ips=$(comm -12 <(echo "$ips") <(echo "$ips_from_doh"))
141 |   if (( $(echo "$common_ips" | wc -w) > 0)); then
142 |     echo "  analysis: OK - Response IPs ["$common_ips"] were found on dns.google.com using DNS-over-HTTPS"
143 |     return 0
144 |   fi
145 | 
146 |   # Validate IPs by establishing a TLS connection. This is vulnerable to IP-based blocking.
147 |   # An upper-case domain may bypass SNI censorship, reducing inconclusive cases.
148 |   local upper_domain=$(echo $domain | tr [:lower:] [:upper:])
149 |   local curl_error
150 |   curl_error=$(curl --silent --show-error --connect-to ::$ip: https://$upper_domain/ 2>&1 > /dev/null)
151 |   local result=$?
152 |   echo "  tls_test: ip=$ip, error=$result"
153 |   if ((result == CURLE_OK)); then
154 |     echo "  analysis: OK - Response IP $ip produced certificate for domain $domain"
155 |     return 0
156 |   elif ((result == CURLE_PEER_FAILED_VERIFICATION)); then
157 |     echo "  analysis: INTERFERENCE - Response IP $ip could not produce a certificate for the domain"
158 |     return 1
159 |   elif ((result == CURLE_SSL_CACERT)); then
160 |     echo "  analysis: INTERFERENCE - Response IP $ip returned a certificate with an invalid CA"
161 |     return 1
162 |   else
163 |     echo "  analysis: INCONCLUSIVE - Could not validate IPs ["$ips"]. TLS test failed ($curl_error)"
164 |     return 2
165 |   fi
166 |   # Other tests to try:
167 |   # - Recurse with dnssec and qname minimization
168 | }
169 | 
170 | # The HTTP test works by connecting to a well-behaved baseline that always returns the same output
171 | # on invalid hostname. We then compare the output for our test domain and a domain we know
172 | # is invalid. If the result changes, then we know there was injection.
173 | function test_http_blocking() {
174 |   echo "HTTP"
175 |   local domain=$1
176 |   # TODO: use a domain we control instead of example.com, which may change without notice.
177 |   # TODO: This breaks if the test domain is hosted in the target host.
178 |   # TODO: This may capture censorship happening in the test server network.
179 |   local http_response
180 |   http_response=$(curl --silent --show-error --max-time 5 --connect-to ::example.com: http://$domain/ 2>&1)
181 |   local http_result=$?
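  # A note on the technique: `--connect-to ::example.com:` (empty host and port
  # fields match any request) makes curl open the TCP connection to example.com
  # while still sending the test domain in the Host header and URL. The network
  # path is therefore constant, and only systems that react to the Host header
  # can change the response we observe.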
182 |   if ((http_result == CURLE_OK)); then
183 |     local expected_response=$(curl --silent --show-error --max-time 5 --connect-to ::example.com: http://inexistent.example.com/ 2>&1)
184 |     if diff <(echo "$http_response") <(echo "$expected_response") > /dev/null; then
185 |       echo "  analysis: OK - Got expected response"
186 |     else
187 |       echo "  analysis: INTERFERENCE - Got injected response"
188 |       diff <(echo "$http_response") <(echo "$expected_response")
189 |     fi
190 |   elif ((http_result == CURLE_GOT_NOTHING)); then
191 |     echo "  analysis: INTERFERENCE - Unexpected empty response when Host is $domain ($http_response)"
192 |   elif ((http_result == CURLE_RECV_ERROR)); then
193 |     echo "  analysis: INTERFERENCE - Cannot receive from established connection when Host is $domain ($http_response)"
194 |   elif ((http_result == CURLE_OPERATION_TIMEDOUT)); then
195 |     echo "  analysis: LIKELY_INTERFERENCE - Unexpected timeout when Host is $domain ($http_response)"
196 |   elif ((http_result == CURLE_COULDNT_CONNECT)); then
197 |     echo "  analysis: INCONCLUSIVE - Failed to connect to innocuous domain ($http_response)"
198 |   else
199 |     # TODO: Find out what errors are guaranteed blocking.
200 |     echo "  analysis: INCONCLUSIVE - Failed to fetch test domain from innocuous domain ($http_response)"
201 |   fi
202 | }
203 | 
204 | # The test for SNI-triggered blocking works by connecting to a well-behaved TLS server we know
205 | # and checking if we can get a ServerHello back when presenting the test domain. If we
206 | # get a response without a ServerHello, which may be empty or a reset, we know it's blocked.
207 | # If we get a ServerHello and the CA chain is valid, then we know there was no injection and
208 | # can conclude there's no blocking.
209 | function test_sni_blocking() {
210 |   echo "SNI"
211 |   local domain=$1
212 |   # The `local` call will override `$?`, so we don't assign on the declaration.
213 |   # Consider using curl --http1.1 https://example.com/ --write-out 'tls_error=%{ssl_verify_result} http_status=%{http_code} header_size=%{size_header} body_size=%{size_download} redirect_url=%{redirect_url} dns=%{time_namelookup} tcp_connect=%{time_connect} tls_connect=%{time_appconnect} request_start=%{time_pretransfer} first_response_byte=%{time_starttransfer}\n' --insecure
214 |   # See https://blog.cloudflare.com/a-question-of-timing/
215 |   local curl_error
216 |   curl_error=$(curl --silent --show-error --max-time 5 --connect-to ::example.com: "https://$domain/" 2>&1 >/dev/null)
217 |   curl_result=$?
218 |   if ((curl_result == CURLE_PEER_FAILED_VERIFICATION || curl_result == CURLE_OK)); then
219 |     echo "  analysis: OK - Got TLS ServerHello"
220 |   elif ((curl_result == CURLE_SSL_CACERT)) && \
221 |       [[ "$curl_error" =~ "no alternative certificate subject name matches target host name" ]]; then
222 |     # On Linux curl outputs CURLE_SSL_CACERT for invalid subject name 🤷.
223 |     echo "  analysis: OK - Got TLS ServerHello"
224 |   elif ((curl_result == CURLE_GOT_NOTHING)); then
225 |     echo "  analysis: INTERFERENCE - Unexpected empty response when SNI is $domain ($curl_error)"
226 |   elif ((curl_result == CURLE_SSL_CONNECT_ERROR)); then
227 |     echo "  analysis: LIKELY_INTERFERENCE - Unexpected TLS error when SNI is $domain ($curl_error)"
228 |   else
229 |     # TODO: Check for invalid CA chain: that indicates the server is misconfigured or
230 |     # there's MITM going on.
231 |     # TODO: Figure out what errors are guaranteed blocking.
232 |     echo "  analysis: INCONCLUSIVE - Failed to get TLS ServerHello ($curl_error)"
233 |   fi
234 | }
235 | 
236 | function main() {
237 |   echo time: "$(date -u)"
238 |   if ! test_connectivity; then
239 |     return 1
240 |   fi
241 | 
242 |   local domain=$1
243 |   echo domain: $domain
244 |   print_client_info
245 |   echo
246 | 
247 |   test_dns_blocking $domain
248 |   test_http_blocking $domain
249 |   test_sni_blocking $domain
250 | 
251 |   # TODO: Test IP blocking
252 | }
253 | 
254 | main "$1"
255 | 
--------------------------------------------------------------------------------
/netanalysis/dns/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/dns/__init__.py
--------------------------------------------------------------------------------
/netanalysis/dns/analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/dns/analysis/__init__.py
--------------------------------------------------------------------------------
/netanalysis/dns/analysis/analysis_app.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2018 Jigsaw Operations LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import ipaddress
18 | import os.path
19 | 
20 | from IPython.display import display
21 | import ipywidgets as widgets
22 | from matplotlib import pyplot
23 | 
24 | from netanalysis.dns.analysis import classifier as dc
25 | from netanalysis.dns.analysis import graph as dg
26 | from netanalysis.tls import domain_ip_validator
27 | 
28 | 
29 | class AnalysisApp:
30 |     def __init__(self, measurements_dir: str) -> None:
31 |         self.progress_bar = widgets.IntProgress(
32 |             value=0,
33 |             step=1,
34 |             description='Loading',
35 |             orientation='horizontal'
36 |         )
37 |         display(self.progress_bar)
38 |         self.dns_graph = dg.load_dns_records_graph(
39 |             os.path.join(measurements_dir, "dns_records.json"),
40 |             self.update_progress)
41 |         self.progress_bar.bar_style = "success"
42 | 
43 |     def domain_app(self, domain):
44 |         return DomainApp(self.dns_graph, domain)
45 | 
46 |     def update_progress(self, done, total):
47 |         self.progress_bar.max = total
48 |         self.progress_bar.value = done
49 | 
50 | 
51 | def _truncate(text: str, max_len: int) -> str:
52 |     """Truncates the text to the given length.
53 | 
54 |     Adds a trailing ellipsis if text gets truncated.
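
    For example (illustrative): _truncate("abcdef", 4) returns "abc…".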
55 | """ 56 | if len(text) > max_len: 57 | return text[:max_len - 1] + "…" 58 | return text 59 | 60 | 61 | class DomainApp: 62 | def __init__(self, dns_graph, domain): 63 | self.domain = domain 64 | self.domain_graph = dg.domain_view(dns_graph, self.domain) 65 | self.classifier = dc.EdgeClassifier(self.domain_graph) 66 | 67 | def display_graph(self, country=None): 68 | pyplot.figure(tight_layout=dict(pad=0)) 69 | pyplot.axis("off") 70 | domain_graph = self.domain_graph 71 | if country: 72 | domain_graph = dg.country_view(domain_graph, country) 73 | dc.draw_graph(self.classifier.class_graph.edge_subgraph( 74 | domain_graph.edges())) 75 | pyplot.show() 76 | 77 | def get_ips(self, net): 78 | ips = set() 79 | for _, _, record in self.domain_graph.in_edges(net, data="record"): 80 | if hasattr(record.data, "ip"): 81 | ips.add(str(record.data.ip)) 82 | return ips 83 | 84 | async def tls_verify_unknowns(self): 85 | validator = domain_ip_validator.DomainIpValidator() 86 | # Try short domains first: they usually validate CNAMES, which tend to be longer. 87 | for domain, target in sorted(self.classifier.class_graph.edges(), key=lambda e: (len(e[0]), e[1])): 88 | if self.classifier.get_class(domain, target) != dc.EdgeClass.UNKNOWN: 89 | continue 90 | try: 91 | ipaddress.ip_network(target) 92 | except (ipaddress.AddressValueError, ValueError): 93 | continue 94 | net = target 95 | print("Checking IPs for {} - {}".format(domain, net)) 96 | for ip in list(self.get_ips(net))[:2]: 97 | print(" Validating {}: ".format(ip), end="") 98 | try: 99 | await validator.validate_ip(domain, ip) 100 | print("VALID") 101 | self.classifier.add_good_edge( 102 | domain, net, "Pass TLS validation") 103 | break 104 | except Exception as e: 105 | print(_truncate(repr(e), 200)) 106 | -------------------------------------------------------------------------------- /netanalysis/dns/analysis/analyze_dns.md: -------------------------------------------------------------------------------- 1 | # Analyze DNS Queries 2 | 3 | In this example, we will fetch measurements for `www.youtube.com` from OONI and analyze them. 4 | 5 | ## Pre-requisites 6 | 7 | [Set up your Python environment](../../python_env.md) 8 | 9 | ## Fetch measurements 10 | 11 | Run 12 | 13 | ``` 14 | time python -m netanalysis.ooni.fetch_measurements --debug --output_dir=ooni_data --country=* --url=www.youtube.com --num_measurements=1000 15 | ``` 16 | 17 | This will take in the order of 10 minutes (the OONI API is not designed for batch processing). Measurements will be written as files in `ooni_data///`. 18 | 19 | ## Convert to DNS resource records 20 | 21 | Run 22 | ``` 23 | time python -m netanalysis.ooni.measurements_to_dns_records --ooni_measurements_dir=ooni_data/ 24 | ``` 25 | 26 | This will create `ooni_data/dns_records.json` with all the DNS records. 27 | 28 | ## Analyze data 29 | 30 | Run 31 | 32 | ``` 33 | .venv/bin/jupyter notebook 34 | ``` 35 | 36 | And open the Jupyter Notebook `notebooks/netanalysis/dns/analysis/DomainAnalysis.ipynb`. 37 | 38 | That notebook will allow you to explore the DNS data and analyze possible interference. 39 | -------------------------------------------------------------------------------- /netanalysis/dns/analysis/analyze_domain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2018 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # pylint: disable=yield-inside-async-function 18 | 19 | # Analysis ideas: 20 | # 21 | # - Ignore failed resolutions in control 22 | # 23 | # POSITIVE SIGNALS 24 | # - Use HTTPS success 25 | # - Create nested resolution whitelist with CNAMEs. Load from all files. 26 | # - Load balancers are likely not censored results. Signs: 27 | # - ttl=0 28 | # - Multiple source domains 29 | # 30 | # NEGATIVE SIGNALS 31 | # - Identify blackholes. Look at all data. Signs: 32 | # - Result AS is TRANSIT_ACCESS and in the client country 33 | # - Multiple source domains 34 | 35 | import argparse 36 | from collections import Counter, defaultdict 37 | from enum import Enum 38 | import glob 39 | import ipaddress 40 | import logging 41 | import os.path 42 | import socket 43 | import sys 44 | from typing import Dict, List 45 | from urllib.parse import urlparse 46 | 47 | import iso3166 48 | import matplotlib.pyplot as pyplot 49 | import networkx 50 | import ujson as json 51 | 52 | import netanalysis.ip.simple_autonomous_system as sas 53 | import netanalysis.ip.model as autonomous_system 54 | 55 | 56 | class DnsResolution: 57 | def __init__(self, measurement, country=None, resolver_ip=None, client_as=None, time=None, url=None): 58 | self.measurement = measurement 59 | self.country = country or "ZZ" 60 | self.resolver_ip = resolver_ip 61 | self.client_as = client_as 62 | self.time = time 63 | self.cnames = [] 64 | self.ips = [] 65 | self.url = url # For debugging 66 | 67 | def __repr__(self): 68 | return "DnsResolution(%s)" % repr(self.__dict__) 69 | 70 | 71 | def path_get(d, path): 72 | roots = [d] 73 | for step in path: 74 | next_roots = [] 75 | for root in roots: 76 | if type(root) != dict: 77 | break 78 | if step not in root: 79 | continue 80 | value = root[step] 81 | if type(value) == list: 82 | next_roots.extend(value) 83 | else: 84 | next_roots.append(value) 85 | roots = next_roots 86 | return roots 87 | 88 | 89 | def get_dns_results(as_repo: autonomous_system.AsRepository, 90 | measurements: List[Dict]) -> List[DnsResolution]: 91 | dns_results = [] 92 | for m in measurements: 93 | time = m.get("measurement_start_time") 94 | country = m.get("probe_cc") 95 | url = urlparse(m.get("input")) 96 | client_asn = int(m.get("probe_asn")[2:]) 97 | client_as = as_repo.get_as(client_asn) 98 | resolver_ip_str = path_get(m, ["test_keys", "client_resolver"])[0] 99 | resolver_ip = ipaddress.ip_address(resolver_ip_str) if resolver_ip_str else None 100 | for measurement in path_get(m, ["test_keys", "queries"]): 101 | dns_resolution = DnsResolution(m, country, resolver_ip, client_as, time, url) 102 | dns_resolution.cnames.append(measurement.get("hostname")) 103 | for answer in measurement.get("answers"): 104 | cname = answer.get("hostname") 105 | if cname: 106 | dns_resolution.cnames.append(cname) 107 | else: 108 | ip_str = answer.get("ipv4") or answer.get("ipv6") 109 | if ip_str: 110 | try: 111 | dns_resolution.ips.append(ipaddress.ip_address(ip_str)) 112 | except ValueError: 113 | logging.warning("Measurement %s: invalid IP answer %s", m["id"], ip_str) 114 | 
dns_results.append(dns_resolution) 115 | return dns_results 116 | 117 | 118 | def get_control_resolutions(measurements): 119 | control_resolutions = [] 120 | for m in measurements: 121 | resolution = DnsResolution(m, time=m.get("measurement_start_time")) 122 | resolution.cnames = [urlparse(m.get("input")).hostname] 123 | for ip_str in path_get(m, ["test_keys", "control", "dns", "addrs"]): 124 | try: 125 | resolution.ips.append(ipaddress.ip_address(ip_str)) 126 | except ValueError: 127 | resolution.cnames.append(ip_str) 128 | if resolution.ips: 129 | control_resolutions.append(resolution) 130 | return control_resolutions 131 | 132 | 133 | def count_resolutions(dns_results): 134 | edges = Counter() 135 | for resolution in dns_results: 136 | last_cname = resolution.cnames[0] 137 | for cname in resolution.cnames: 138 | edges[(last_cname, cname)] += 1 139 | last_cname = cname 140 | for ip_address in resolution.ips or [""]: 141 | edges[(last_cname, ip_address)] += 1 142 | return edges 143 | 144 | 145 | def get_ips(dns_resolutions): 146 | ips = set() 147 | for resolution in dns_resolutions: 148 | ips.update(resolution.ips) 149 | return ips 150 | 151 | 152 | class DnsResolutionClassification(Enum): 153 | UNKNOWN = 0 154 | FREE = 1 155 | CENSORED = 2 156 | EMPTY = 3 157 | 158 | 159 | def is_success_http_code(http_code): 160 | return 200 <= http_code and http_code <= 399 161 | 162 | 163 | class DnsResolutionClassifier: 164 | def __init__(self) -> None: 165 | self._good_ips = set() 166 | 167 | def _get_ip_key(self, ip): 168 | return ipaddress.ip_network(ip).supernet(new_prefix=21) 169 | 170 | def add_good_resolution(self, resolution: DnsResolution): 171 | for ip in resolution.ips: 172 | self._good_ips.add(self._get_ip_key(ip)) 173 | 174 | def classify_resolutions(self, resolutions: List[DnsResolution]): 175 | classifications = [DnsResolutionClassification.UNKNOWN] * len(resolutions) 176 | base_good_ips = 0 177 | while base_good_ips < len(self._good_ips): 178 | base_good_ips = len(self._good_ips) 179 | for ri, resolution in enumerate(resolutions): 180 | if classifications[ri] != DnsResolutionClassification.UNKNOWN: 181 | continue 182 | if not resolution.ips: 183 | classifications[ri] = DnsResolutionClassification.EMPTY 184 | continue 185 | ip_keys = set(self._get_ip_key(ip) for ip in resolution.ips) 186 | for ip_key in ip_keys: 187 | if ip_key in self._good_ips: 188 | self._good_ips.update(ip_keys) 189 | classifications[ri] = DnsResolutionClassification.FREE 190 | break 191 | # If resolution is good, consider all ips good. 
192 | if classifications[ri] == DnsResolutionClassification.FREE: 193 | self._good_ips.update(ip_keys) 194 | continue 195 | 196 | for ip in resolution.ips: 197 | if not ip.is_global: 198 | classifications[ri] = DnsResolutionClassification.CENSORED 199 | break 200 | print("Good IPs: %s" % self._good_ips) 201 | return classifications 202 | 203 | 204 | def group_by(sequence, get_key): 205 | result = defaultdict(list) 206 | for item in sequence: 207 | result[get_key(item)].append(item) 208 | return result 209 | 210 | 211 | def make_resolver_key(as_repo, resolution): 212 | resolver_as = as_repo.get_as_for_ip(resolution.resolver_ip) 213 | if resolver_as == resolution.client_as: 214 | return "ISP" 215 | else: 216 | return resolver_as.org.name or resolver_as.org.id 217 | 218 | 219 | def as_str(asys): 220 | org = asys.org.name or asys.org.id 221 | if org: 222 | return "%s (%s), Org: '%s' from %s" % (asys.name, asys.type.name, org[:20], asys.org.country) 223 | else: 224 | return asys.name 225 | 226 | 227 | # Cache for ip -> hostname resolutions 228 | _IP_NAMES: Dict[str, str] = {} 229 | 230 | 231 | def resolve_ip(ip): 232 | hostname = _IP_NAMES.get(ip.compressed, "") 233 | if hostname == "": 234 | try: 235 | hostname = socket.gethostbyaddr(ip.compressed)[0] 236 | except socket.herror: 237 | hostname = None 238 | _IP_NAMES[ip.compressed] = hostname 239 | return hostname 240 | 241 | 242 | def show_resolutions_graph(as_repo, domain, control_resolutions, dns_resolutions): 243 | graph = networkx.DiGraph() 244 | cnames = set() 245 | ip_nets = set() 246 | ases = set() 247 | bad_edges = set() 248 | bad_nodes = set() 249 | good_edges = set() 250 | good_nodes = set() 251 | edge_countries = defaultdict(set) 252 | in_control = True 253 | for resolutions in [control_resolutions or [], dns_resolutions]: 254 | for resolution in resolutions: 255 | country = resolution.measurement["probe_cc"] 256 | last_cname = resolution.cnames[0] 257 | cnames.add(last_cname) 258 | for cname in resolution.cnames: 259 | edge = last_cname, cname 260 | graph.add_edge(*edge) 261 | edge_countries[edge].add(country) 262 | if in_control: 263 | good_edges.add(edge) 264 | good_nodes.add(cname) 265 | last_cname = cname 266 | cnames.add(cname) 267 | for ip_address in resolution.ips or [None]: 268 | if ip_address: 269 | ip_net = ipaddress.ip_network(ip_address).supernet(new_prefix=22) 270 | asys = as_repo.get_as_for_ip(ip_address) 271 | as_str = asys.name or str(asys.id) 272 | ases.add(as_str) 273 | graph.add_edge(ip_net, as_str) 274 | if not ip_address.is_global: 275 | bad_edges.add((last_cname, ip_net)) 276 | bad_nodes.add(ip_net) 277 | else: 278 | ip_net = "" 279 | ip_nets.add(ip_net) 280 | edge = last_cname, ip_net 281 | graph.add_edge(*edge) 282 | edge_countries[edge].add(country) 283 | if in_control: 284 | good_edges.add(edge) 285 | good_nodes.add(ip_net) 286 | in_control = False 287 | 288 | nodes_pos = networkx.spring_layout(graph) 289 | min_x = min(x for x, _ in nodes_pos.values()) 290 | max_x = max(x for x, _ in nodes_pos.values()) 291 | range_x = max_x - min_x 292 | for node, pos in list(nodes_pos.items()): 293 | if isinstance(node, (ipaddress.IPv4Network, ipaddress.IPv6Network)): 294 | nodes_pos[node] = (min_x + range_x * 0.5 + (pos[0] - min_x) * 0.3, pos[1]) 295 | else: 296 | nodes_pos[node] = (min_x + range_x * 0.1 + (pos[0] - min_x) * 0.3, pos[1]) 297 | nodes_pos[domain] = (min_x, nodes_pos[domain][1]) 298 | for asys in ases: 299 | nodes_pos[asys] = (max_x, nodes_pos[asys][1]) 300 | networkx.draw_networkx_nodes(graph, 
nodelist=cnames, pos=nodes_pos, node_color="b") 301 | networkx.draw_networkx_nodes(graph, nodelist=ip_nets - bad_nodes, pos=nodes_pos, node_color="gray") 302 | networkx.draw_networkx_labels(graph, pos=nodes_pos, font_size=8) 303 | networkx.draw_networkx_edges(graph, pos=nodes_pos, alpha=0.25) 304 | edge_labels = dict((key, " ".join(countries) if len(countries) <= 3 else "*") for key, countries in edge_countries.items()) 305 | networkx.draw_networkx_edge_labels(graph, edge_labels=edge_labels, pos=nodes_pos, alpha=0.5, font_size=8, label_pos=0.2) 306 | networkx.draw_networkx_edges(graph, edgelist=good_edges, pos=nodes_pos, alpha=0.5, edge_color="g") 307 | networkx.draw_networkx_nodes(graph, nodelist=good_nodes, pos=nodes_pos, node_color="g") 308 | networkx.draw_networkx_edges(graph, edgelist=bad_edges, pos=nodes_pos, alpha=0.5, edge_color="r") 309 | networkx.draw_networkx_nodes(graph, nodelist=bad_nodes, pos=nodes_pos, node_color="r") 310 | pyplot.show() 311 | 312 | 313 | def main(args): 314 | if args.debug: 315 | logging.basicConfig(level=logging.DEBUG) 316 | 317 | measurements = [] 318 | for filename in glob.iglob(os.path.join(args.measurements_dir, args.domain, "*", "*")): 319 | with open(filename) as file: 320 | measurements.append(json.load(file)) 321 | 322 | as_repo = sas.create_default_as_repo() 323 | 324 | classifier = DnsResolutionClassifier() 325 | control_resolutions = get_control_resolutions(measurements) 326 | for resolution in control_resolutions: 327 | classifier.add_good_resolution(resolution) 328 | 329 | print("\nCONTROL") 330 | for resolution, count in count_resolutions(control_resolutions).most_common(): 331 | print("%s -> %s: %d" % (resolution[0], resolution[1], count)) 332 | 333 | dns_resolutions = get_dns_results(as_repo, measurements) 334 | show_resolutions_graph(as_repo, args.domain, control_resolutions, dns_resolutions) 335 | 336 | print("\nTESTS") 337 | classified_resolutions = zip(dns_resolutions, 338 | classifier.classify_resolutions(dns_resolutions)) 339 | 340 | for country_code, country_classifications in group_by(classified_resolutions, lambda e: e[0].country).items(): 341 | try: 342 | country_name = iso3166.countries.get(country_code).name 343 | except KeyError: 344 | country_name = "Unknown" 345 | print("\n=============\n= %s (%s)\n=============" % (country_name, country_code)) 346 | country_count = len(country_classifications) 347 | grouped_country_classifications = group_by(country_classifications, lambda e: e[1]) 348 | for classification, entries in grouped_country_classifications.items(): 349 | class_count = len(entries) 350 | prefix = "All " if class_count == country_count else "" 351 | print(" %s%s: %d/%d" % (prefix, classification.name.lower(), class_count, country_count)) 352 | # if len(grouped_country_classifications[DnsResolutionClassification.FREE]) == country_count: 353 | # continue 354 | 355 | print("\n By Resolver:") 356 | for resolver_key, resolver_classifications in group_by(country_classifications, 357 | lambda e: make_resolver_key(as_repo, e[0])).items(): 358 | print(" - %s:" % resolver_key) 359 | resolver_count = len(resolver_classifications) 360 | for classification, entries in group_by(resolver_classifications, lambda e: e[1]).items(): 361 | class_count = len(entries) 362 | prefix = "All " if class_count == resolver_count else "" 363 | print(" %s%s: %d/%d" % (prefix, classification.name.lower(), class_count, resolver_count)) 364 | 365 | for classification, entries in grouped_country_classifications.items(): 366 | if classification == 
DnsResolutionClassification.EMPTY or not entries:
367 |                 continue
368 |             print("\n %s resolutions:" % classification.name)
369 |             displayed = set()
370 |             for resolution, _ in entries:
371 |                 display_str = ",\n ".join(["%s (%s)" % (resolve_ip(ip) or ip, as_str(
372 |                     as_repo.get_as_for_ip(ip))) for ip in sorted(resolution.ips)])
373 |                 if display_str in displayed:
374 |                     continue
375 |                 print(" - [%s] %s\n => %s" % (display_str, resolution.url.geturl(),
376 |                     path_get(resolution.measurement, ["test_keys", "requests", "failure"])))
377 |                 displayed.add(display_str)
378 |                 # print(json.dumps(resolution.measurement, indent=4, sort_keys=True))
379 | 
380 | 
381 | if __name__ == "__main__":
382 |     parser = argparse.ArgumentParser(description="Analyze DNS measurements from OONI")
383 |     parser.add_argument("--measurements_dir", type=str, required=True)
384 |     parser.add_argument("--domain", type=str, required=True)
385 |     parser.add_argument("--debug", action="store_true")
386 |     sys.exit(main(parser.parse_args()))
387 | 
--------------------------------------------------------------------------------
/netanalysis/dns/analysis/classifier.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Jigsaw Operations LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
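#
# Overview: EdgeClassifier (below) operates on the transitive closure of the
# DNS resolution graph built in graph.py. It seeds evaluations from the data
# itself -- GOOD for edges backed by a measurement with a trust_reason, BAD
# for edges pointing at a non-global IP -- and then propagates GOOD along
# every path connecting the endpoints of a known-GOOD pair, so one trusted
# resolution can validate many intermediate CNAME/IP edges.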
14 | 15 | from collections import namedtuple 16 | from enum import Enum 17 | from itertools import chain 18 | 19 | import networkx as nx 20 | 21 | 22 | class EdgeClass(Enum): 23 | UNKNOWN = 0 24 | GOOD = 1 25 | BAD = 2 26 | 27 | 28 | Evaluation = namedtuple("Evaluation", ["classification", "reason"]) 29 | 30 | 31 | def edge_class(class_graph: nx.DiGraph, u: str, v: str) -> EdgeClass: 32 | try: 33 | return class_graph[u][v]["eval"].classification 34 | except KeyError: 35 | return EdgeClass.UNKNOWN 36 | 37 | 38 | def good_predecessors(class_graph: nx.DiGraph, node: str): 39 | """All predecessors following GOOD edges only.""" 40 | for p in class_graph.predecessors(node): 41 | if edge_class(class_graph, p, node) == EdgeClass.GOOD: 42 | yield p 43 | 44 | 45 | def good_successors(class_graph: nx.DiGraph, node): 46 | """All successors following GOOD edges only.""" 47 | for s in class_graph.successors(node): 48 | if edge_class(class_graph, node, s) == EdgeClass.GOOD: 49 | yield s 50 | 51 | 52 | class EdgeClassifier: 53 | def __init__(self, multi_graph: nx.MultiDiGraph) -> None: 54 | self.class_graph = nx.transitive_closure( 55 | nx.DiGraph(multi_graph.edges())) 56 | for _u, _v, data in self.class_graph.edges(data=True): 57 | data["eval"] = Evaluation(EdgeClass.UNKNOWN, None) 58 | 59 | for u, v, data in multi_graph.edges(data=True): 60 | if self.get_class(u, v) != EdgeClass.UNKNOWN: 61 | continue 62 | measurement = data["measurement"] 63 | if measurement.trust_reason: 64 | self.add_good_edge(u, v, measurement.trust_reason) 65 | else: 66 | ip = getattr(data["record"].data, "ip", None) 67 | if ip and not ip.is_global: 68 | self.add_bad_edge(u, v, "Non global IP") 69 | 70 | def get_class(self, u: str, v: str) -> EdgeClass: 71 | return edge_class(self.class_graph, u, v) 72 | 73 | def mark_all_paths_good(self, u: str, v: str): 74 | """Mark all paths between u and v as GOOD. 75 | Assumes the input graph has all the edges from its transitive closure. 76 | """ 77 | for s in self.class_graph.successors(u): 78 | if not self.class_graph.has_edge(s, v): 79 | continue 80 | self.add_good_edge(u, s, "On path for GOOD pair (%s, %s)" % (u, v)) 81 | self.add_good_edge(s, v, "On path for GOOD pair (%s, %s)" % (u, v)) 82 | 83 | def mark_new_connections_good(self, u, v): 84 | """Add new good connections from adding the GOOD edge u, v. 85 | Assumes the class_graph has all the edges from its transitive closure. 
86 |         """
87 |         for p in chain([u], good_predecessors(self.class_graph, u)):
88 |             for s in chain([v], good_successors(self.class_graph, v)):
89 |                 path = [p]
90 |                 if p != u:
91 |                     path.append(u)
92 |                 path.append(v)
93 |                 if s != v:
94 |                     path.append(s)
95 |                 self.add_good_edge(
96 |                     p, s, "Path (%s) is GOOD" % ", ".join(path))
97 | 
98 |     def add_good_edge(self, u: str, v: str, reason: str):
99 |         if u == v or self.get_class(u, v) != EdgeClass.UNKNOWN:
100 |             return
101 |         self.class_graph[u][v]["eval"] = Evaluation(EdgeClass.GOOD, reason)
102 |         self.mark_all_paths_good(u, v)
103 |         self.mark_new_connections_good(u, v)
104 |         # TODO: Mark all IP edges as GOOD if measurement is GOOD
105 |         # Can do that by adding an extra node per measurement and
106 |         # (ip, measurement_id) edges for last step
107 | 
108 |     def add_bad_edge(self, u: str, v: str, reason: str):
109 |         if self.get_class(u, v) != EdgeClass.UNKNOWN:
110 |             return
111 |         self.class_graph[u][v]["eval"] = Evaluation(EdgeClass.BAD, reason)
112 | 
113 |     def unknown_edges(self):
114 |         for u, v, edge_eval in self.class_graph.edges(data="eval"):
115 |             if edge_eval.classification == EdgeClass.UNKNOWN:
116 |                 yield u, v
117 | 
118 | 
119 | def classify_edges(multi_graph: nx.MultiDiGraph) -> nx.DiGraph:
120 |     return EdgeClassifier(multi_graph).class_graph
121 | 
122 | 
123 | def _get_edge_class(edge_data):
124 |     try:
125 |         return edge_data["eval"].classification
126 |     except KeyError:
127 |         return EdgeClass.UNKNOWN
128 | 
129 | 
130 | def draw_graph(graph: nx.DiGraph):
131 |     good_edges = set((u, v) for u, v, data in graph.edges(
132 |         data=True) if _get_edge_class(data) == EdgeClass.GOOD)
133 |     good_nodes = set(v for (u, v) in good_edges)
134 |     bad_edges = set((u, v) for u, v, data in graph.edges(
135 |         data=True) if _get_edge_class(data) == EdgeClass.BAD)
136 |     bad_nodes = set(v for (u, v) in bad_edges)
137 | 
138 |     nodes_pos = nx.spring_layout(graph)
139 |     nx.draw_networkx_nodes(graph, pos=nodes_pos, alpha=0.6, node_color="gray")
140 |     nx.draw_networkx_nodes(graph, pos=nodes_pos, alpha=0.8, nodelist=good_nodes, node_color="g")
141 |     nx.draw_networkx_nodes(graph, pos=nodes_pos, alpha=0.8, nodelist=bad_nodes, node_color="r")
142 |     nx.draw_networkx_labels(graph, pos=nodes_pos, font_size=8)
143 |     nx.draw_networkx_edges(graph, pos=nodes_pos, alpha=0.25)
144 |     nx.draw_networkx_edges(graph, edgelist=good_edges,
145 |                            pos=nodes_pos, alpha=0.5, width=4, edge_color="g")
146 |     nx.draw_networkx_edges(graph, edgelist=bad_edges,
147 |                            pos=nodes_pos, alpha=0.5, width=4, edge_color="r")
148 | 
--------------------------------------------------------------------------------
/netanalysis/dns/analysis/graph.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Jigsaw Operations LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
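#
# Overview: load_dns_records_graph (below) builds a MultiDiGraph in which each
# DNS record adds an edge from the queried name to its CNAME target, or to the
# supernet (new_prefix=24) that contains the answer IP. Each edge keeps its
# record and measurement as attributes, which domain_view() and country_view()
# use to extract per-domain and per-country subgraphs.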
14 | 15 | import ipaddress 16 | import logging 17 | import os 18 | 19 | import networkx as nx 20 | import ujson as json 21 | 22 | import netanalysis.dns.data.serialization as ds 23 | 24 | 25 | def _get_edge_target(data): 26 | if hasattr(data, "cname"): 27 | return data.cname.lower() 28 | if hasattr(data, "ip"): 29 | net = ipaddress.ip_network(data.ip).supernet(new_prefix=24) 30 | return net.compressed 31 | 32 | 33 | def load_dns_records_graph(dns_measurements_filename: str, 34 | update_progress=lambda done, total: None) -> nx.MultiDiGraph: 35 | graph = nx.MultiDiGraph() 36 | file_size = os.stat(dns_measurements_filename).st_size 37 | update_progress(0, file_size) 38 | done_bytes = 0 39 | done_step = 0 40 | with open(dns_measurements_filename) as measurements_file: 41 | for line in measurements_file: 42 | try: 43 | measurement = ds.measurement_from_json(json.loads(line)) 44 | for record in measurement.records: 45 | source = record.name.lower() 46 | target = _get_edge_target(record.data) 47 | if not target: 48 | raise ValueError("No record target for DnsMeasurement: %s" % measurement) 49 | graph.add_edge(source, target, None, record=record, measurement=measurement) 50 | except Exception as e: 51 | logging.error("Failed to process measurement:\n%s", line) 52 | raise e 53 | done_bytes += len(line) 54 | new_step = int(done_bytes * 100 / file_size / 5) 55 | if new_step != done_step: 56 | update_progress(done_bytes, file_size) 57 | done_step = new_step 58 | return graph 59 | 60 | 61 | def domain_view(multi_graph: nx.MultiDiGraph, root_domain: str) -> nx.MultiDiGraph: 62 | """Returns the subgraph rooted at the given domain.""" 63 | return multi_graph.subgraph(nx.dfs_preorder_nodes(multi_graph, root_domain)) 64 | 65 | 66 | def country_view(multi_graph: nx.MultiDiGraph, client_country: str) -> nx.MultiDiGraph: 67 | """Returns a view of the edges restricted to the given client country.""" 68 | country_edges = [(u, v, k) for u, v, k, measurement in multi_graph.edges( 69 | keys=True, data="measurement") if measurement.client_country == client_country] 70 | return multi_graph.edge_subgraph(country_edges) 71 | -------------------------------------------------------------------------------- /netanalysis/dns/analysis/ip_info_widget.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2018 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
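#
# A minimal notebook usage sketch (assumes a Jupyter/IPython context and the
# bundled IP metadata files):
#
#   from netanalysis.ip import ip_info
#   widget = create_ip_info_widget(ip_info.create_default_ip_info_service())
#   display(widget)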
16 | 
17 | import asyncio
18 | import ipaddress
19 | import pprint
20 | 
21 | import ipywidgets as widgets
22 | 
23 | from netanalysis.ip import ip_info as ii
24 | from netanalysis.ip import model
25 | from netanalysis.tls import domain_ip_validator
26 | 
27 | 
28 | VALIDATOR = domain_ip_validator.DomainIpValidator()
29 | 
30 | 
31 | def create_ip_info_widget(ip_info: ii.IpInfoService):
32 |     ip_field = widgets.Text(placeholder="Enter ip address", description="IP")
33 |     get_btn = widgets.Button(description="Get info")
34 |     output = widgets.Output()
35 | 
36 |     def show_ip_info(_):
37 |         output.clear_output()
38 |         if not ip_field.value:
39 |             return
40 |         try:
41 |             ip_address = ipaddress.ip_address(ip_field.value)
42 |         except ValueError:
43 |             with output:
44 |                 print("Invalid IP: %s" % ip_field.value)
45 |             return
46 |         asys: model.AutonomousSystem = ip_info.get_as(ip_address)
47 |         with output:
48 |             print("ASN: %d (%s)" % (asys.id, asys.name))
49 |             # AS type data is experimental and outdated.
50 |             print("Type: %s" % asys.type.name)
51 |             print("Org: %s (country: %s, name: %s)" %
52 |                   (asys.org.id, asys.org.country, asys.org.name))
53 |             if ip_address.is_global:
54 |                 hostname = ip_info.resolve_ip(ip_address)
55 |                 if hostname:
56 |                     print("Hostname: %s" % hostname)
57 |             else:
58 |                 print("IP is not global")
59 |             try:
60 |                 cert = asyncio.get_event_loop().run_until_complete(
61 |                     VALIDATOR.get_cert(None, ip_address))
62 |                 if cert:
63 |                     print("TLS Certificate:\n%s" %
64 |                           pprint.pformat(cert, width=100, compact=True))
65 |             except Exception as e:
66 |                 print("TLS Certificate: %s" % repr(e))
67 |     get_btn.on_click(show_ip_info)
68 |     return widgets.VBox([widgets.HBox([ip_field, get_btn]), output])
69 | 
--------------------------------------------------------------------------------
/netanalysis/dns/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/dns/data/__init__.py
--------------------------------------------------------------------------------
/netanalysis/dns/data/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Jigsaw Operations LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
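#
# Overview: a DnsMeasurement groups the ResourceRecords observed in a single
# measurement, plus metadata (resolver IP, client ASN and country, provenance,
# and an optional trust_reason, which the edge classifier uses to seed GOOD
# edges).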
14 | 15 | import datetime 16 | from ipaddress import ip_address, IPv4Address, IPv6Address 17 | from typing import List, Union 18 | 19 | 20 | class RecordData: 21 | """Represents the data in a DNS Resource Record.""" 22 | 23 | def __repr__(self): 24 | return "%s(%s)" % (self.__class__, str(self.__dict__)) 25 | 26 | 27 | class IpAddressData(RecordData): 28 | """Data for Resource Record type A or AAAA""" 29 | 30 | def __init__(self, ip_str: str) -> None: 31 | self._ip = ip_address(ip_str) 32 | 33 | @property 34 | def ip(self): 35 | return self._ip 36 | 37 | 38 | class CnameData(RecordData): 39 | """Data for Resource Record type CNAME""" 40 | 41 | def __init__(self, cname: str) -> None: 42 | self._cname = cname 43 | 44 | @property 45 | def cname(self): 46 | return self._cname 47 | 48 | 49 | class ResourceRecord: 50 | def __init__(self, name: str, data: RecordData, ttl: datetime.timedelta = None) -> None: 51 | if not name: 52 | raise ValueError("ResourceRecord requires name") 53 | self.name = name 54 | self.data = data 55 | self.ttl = ttl 56 | if not isinstance(ttl, (type(None), datetime.timedelta)): 57 | raise ValueError("ttl must be of type datetime.timedelta. Found type %s, value %s" % ( 58 | type(ttl), repr(ttl))) 59 | 60 | def __repr__(self): 61 | return "%s(%s)" % (self.__class__, str(self.__dict__)) 62 | 63 | 64 | class DnsMeasurement: 65 | def __init__(self, 66 | measurement_id: str, 67 | time: datetime.datetime, 68 | records: List[ResourceRecord], 69 | resolver_ip: Union[IPv4Address, IPv6Address] = None, 70 | client_asn: int = None, 71 | client_country: str = None, 72 | provenance: str = None, 73 | trust_reason: str = None) -> None: 74 | self.measurement_id = measurement_id 75 | self.time = time 76 | self.records = records 77 | self.resolver_ip = resolver_ip 78 | self.client_asn = client_asn 79 | self.client_country = client_country 80 | self.provenance = provenance 81 | self.trust_reason = trust_reason 82 | 83 | def __repr__(self): 84 | return "DnsMeasurement(%s)" % str(self.__dict__) 85 | -------------------------------------------------------------------------------- /netanalysis/dns/data/serialization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Jigsaw Operations LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import datetime 16 | from functools import singledispatch 17 | from typing import Dict, List 18 | 19 | import ujson as json 20 | 21 | from . 
import model 22 | 23 | 24 | @singledispatch 25 | def to_json(value): 26 | return value 27 | 28 | 29 | @to_json.register(List) 30 | def _(value): 31 | return [to_json(e) for e in value] 32 | 33 | 34 | @to_json.register(model.IpAddressData) 35 | def _(data): 36 | return {"ip": str(data.ip)} 37 | 38 | 39 | @to_json.register(model.CnameData) 40 | def _(data): 41 | return {"cname": data.cname} 42 | 43 | 44 | @to_json.register(model.ResourceRecord) 45 | def _(record): 46 | query_json = {} 47 | for field in ["name", "data", "ttl"]: 48 | value = getattr(record, field) 49 | if value is None: 50 | continue 51 | if field == "ttl": 52 | value = value.seconds 53 | query_json[field] = to_json(value) 54 | return query_json 55 | 56 | 57 | @to_json.register(model.DnsMeasurement) 58 | def _(measurement): 59 | measurement_json = {} 60 | for key, value in measurement.__dict__.items(): 61 | if value is None: 62 | continue 63 | if key == "resolver_ip": 64 | value = str(value) 65 | measurement_json[key] = to_json(value) 66 | return measurement_json 67 | 68 | 69 | def record_data_from_json(data_json: Dict) -> model.RecordData: 70 | if "ip" in data_json: 71 | return model.IpAddressData(data_json["ip"]) 72 | elif "cname" in data_json: 73 | return model.CnameData(data_json["cname"]) 74 | else: 75 | raise ValueError("Invalid RecordData json: %s" % 76 | json.dumps(data_json)) 77 | 78 | 79 | def record_from_json(record_json: Dict) -> model.ResourceRecord: 80 | params = {} 81 | for key, value in record_json.items(): 82 | if key == "data": 83 | value = record_data_from_json(value) 84 | elif key == "ttl": 85 | value = datetime.timedelta(seconds=value) 86 | if value is not None: 87 | params[key] = value 88 | return model.ResourceRecord(**params) 89 | 90 | 91 | def measurement_from_json(measurement_json: Dict) -> model.DnsMeasurement: 92 | params = {} 93 | for key, value in measurement_json.items(): 94 | if key == "time": 95 | value = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S") 96 | elif key == "records": 97 | value = [record_from_json(r) for r in value] 98 | if value is not None: 99 | params[key] = value 100 | return model.DnsMeasurement(**params) 101 | -------------------------------------------------------------------------------- /netanalysis/dns/google/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/dns/google/__init__.py -------------------------------------------------------------------------------- /netanalysis/dns/google/get_google_dns_locations.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | # 3 | # Copyright 2017 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Code from https://developers.google.com/speed/public-dns/faq#locations 18 | 19 | IFS="\"$IFS" 20 | #for LOC in $(dig -t TXT +short locations.publicdns.goog. 
@8.8.8.8) 21 | for LOC in $(dig -t TXT +short locations.publicdns.goog.) 22 | do 23 | case $LOC in 24 | '') : ;; 25 | *.*|*:*) printf '%s ' ${LOC} ;; 26 | *) printf '%s\n' ${LOC} ;; 27 | esac 28 | done -------------------------------------------------------------------------------- /netanalysis/dns/google/google_dns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2017 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import ipaddress 18 | import os.path 19 | 20 | 21 | class Server: # pylint: disable=too-few-public-methods 22 | def __init__(self, ip_address, location_id: str) -> None: 23 | self.ip_address = ip_address 24 | self.location_id = location_id 25 | 26 | 27 | class GoogleDns: 28 | def __init__(self): 29 | self.networks = [] 30 | 31 | def add_network(self, ip_network, location_id): 32 | self.networks.append((ip_network, location_id)) 33 | 34 | def get_server(self, ip_address): 35 | for ip_network, location_id in self.networks: 36 | if ip_address in ip_network: 37 | return Server(ip_address, location_id) 38 | return None 39 | 40 | 41 | def create_google_dns_from_filename(filename): 42 | google_dns = GoogleDns() 43 | with open(filename) as data_file: 44 | for line in data_file: 45 | ip_address_str, location_id = line.strip().split() 46 | ip_network = ipaddress.ip_network(ip_address_str) 47 | google_dns.add_network(ip_network, location_id) 48 | return google_dns 49 | 50 | 51 | def create_default_google_dns(): 52 | filename = os.path.join(os.path.dirname(__file__), 53 | "google_dns_locations.txt") 54 | return create_google_dns_from_filename(filename) 55 | -------------------------------------------------------------------------------- /netanalysis/dns/google/google_dns_locations.txt: -------------------------------------------------------------------------------- 1 | 74.125.18.0/26 iad 2 | 74.125.18.64/26 iad 3 | 74.125.18.128/26 syd 4 | 74.125.18.192/26 lhr 5 | 74.125.19.0/24 mrn 6 | 74.125.41.0/24 tpe 7 | 74.125.42.0/24 atl 8 | 74.125.44.0/24 mrn 9 | 74.125.45.0/24 tul 10 | 74.125.46.0/24 lpp 11 | 74.125.47.0/24 bru 12 | 74.125.72.0/24 cbf 13 | 74.125.73.0/24 bru 14 | 74.125.74.0/24 lpp 15 | 74.125.75.0/24 chs 16 | 74.125.76.0/24 cbf 17 | 74.125.77.0/24 chs 18 | 74.125.79.0/24 lpp 19 | 74.125.80.0/24 dls 20 | 74.125.81.0/24 dub 21 | 74.125.92.0/24 mrn 22 | 74.125.112.0/24 lpp 23 | 74.125.113.0/24 cbf 24 | 74.125.115.0/24 tul 25 | 74.125.176.0/24 mrn 26 | 74.125.177.0/24 atl 27 | 74.125.178.0/24 atl 28 | 74.125.179.0/24 cbf 29 | 74.125.181.0/24 bru 30 | 74.125.182.0/24 cbf 31 | 74.125.183.0/24 cbf 32 | 74.125.184.0/24 chs 33 | 74.125.185.0/24 chs 34 | 74.125.186.0/24 dls 35 | 74.125.187.0/24 dls 36 | 74.125.190.0/24 sin 37 | 74.125.191.0/24 tul 38 | 172.217.32.0/26 lhr 39 | 172.217.32.64/26 lhr 40 | 172.217.32.128/26 sin 41 | 172.217.33.0/26 syd 42 | 172.217.33.64/26 syd 43 | 172.217.33.128/26 fra 44 | 172.217.33.192/26 fra 45 | 
172.217.34.0/26 fra 46 | 172.217.34.64/26 bom 47 | 172.217.34.192/26 bom 48 | 172.217.35.0/24 gru 49 | 172.217.36.0/24 atl 50 | 172.217.37.0/24 gru 51 | 172.217.38.0/24 bom 52 | 172.217.39.0/24 atl 53 | 172.217.40.0/24 grq 54 | 172.217.41.0/24 grq 55 | 172.217.42.0/24 tpe 56 | 172.217.43.0/24 yul 57 | 172.217.44.0/24 yul 58 | 172.217.45.0/24 yul 59 | 172.217.47.0/24 sin 60 | 173.194.90.0/24 cbf 61 | 173.194.91.0/24 scl 62 | 173.194.93.0/24 tpe 63 | 173.194.94.0/24 cbf 64 | 173.194.95.0/24 tul 65 | 173.194.97.0/24 chs 66 | 173.194.98.0/24 lpp 67 | 173.194.99.0/24 tul 68 | 173.194.100.0/24 mrn 69 | 173.194.101.0/24 tul 70 | 173.194.102.0/24 atl 71 | 173.194.103.0/24 cbf 72 | 173.194.168.0/26 nrt 73 | 173.194.168.64/26 nrt 74 | 173.194.168.128/26 nrt 75 | 173.194.168.192/26 iad 76 | 173.194.169.0/24 grq 77 | 173.194.170.0/24 grq 78 | 173.194.171.0/24 tpe 79 | 2404:6800:4000::/48 bom 80 | 2404:6800:4003::/48 sin 81 | 2404:6800:4006::/48 syd 82 | 2404:6800:4008::/48 tpe 83 | 2404:6800:400b::/48 nrt 84 | 2607:f8b0:4001::/48 cbf 85 | 2607:f8b0:4002::/48 atl 86 | 2607:f8b0:4003::/48 tul 87 | 2607:f8b0:4004::/48 iad 88 | 2607:f8b0:400c::/48 chs 89 | 2607:f8b0:400d::/48 mrn 90 | 2607:f8b0:400e::/48 dls 91 | 2607:f8b0:4020::/48 yul 92 | 2800:3f0:4001::/48 gru 93 | 2800:3f0:4003::/48 scl 94 | 2a00:1450:4001::/48 fra 95 | 2a00:1450:4009::/48 lhr 96 | 2a00:1450:400b::/48 dub 97 | 2a00:1450:400c::/48 bru 98 | 2a00:1450:4010::/48 lpp 99 | 2a00:1450:4013::/48 grq 100 | -------------------------------------------------------------------------------- /netanalysis/dns/google/is_google_dns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2017 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | import ipaddress 19 | import sys 20 | 21 | from . import google_dns as gd 22 | 23 | 24 | def main(args): 25 | ip_address = args.ip_address[0] 26 | google_dns = gd.create_default_google_dns() 27 | server = google_dns.get_server(ip_address) 28 | if not server: 29 | print("%s is NOT a Google DNS server" % ip_address) 30 | return 1 31 | print("%s is a Google DNS server at %s" % (ip_address, server.location_id)) 32 | return 0 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser( 37 | description='Checks if the given ip address is a Google DNS one') 38 | parser.add_argument('ip_address', type=ipaddress.ip_address, 39 | nargs=1, help='The IP address to check') 40 | sys.exit(main(parser.parse_args())) 41 | -------------------------------------------------------------------------------- /netanalysis/dns/google/test_google_dns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2017 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import ipaddress 18 | import unittest 19 | 20 | from . import google_dns as gdns 21 | 22 | 23 | class TestGoogleDns(unittest.TestCase): 24 | 25 | def test_default_servers(self): 26 | google_dns = gdns.create_default_google_dns() 27 | self.assertIsNone(google_dns.get_server( 28 | ipaddress.ip_address("201.249.215.0"))) 29 | server = google_dns.get_server(ipaddress.ip_address("74.125.80.128")) 30 | self.assertIsNotNone(server) 31 | self.assertEqual("dls", server.location_id) 32 | self.assertEqual("74.125.80.128", server.ip_address.compressed) 33 | 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /netanalysis/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/infrastructure/__init__.py -------------------------------------------------------------------------------- /netanalysis/infrastructure/resources.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2018 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
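#
# Usage sketch: resource paths are given relative to the netanalysis package
# directory, e.g.
#   resource_filename("third_party/caida.org/as-classification/as2types.txt.gz")
# returns the absolute filesystem path of that bundled data file.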
16 | 
17 | import os.path
18 | 
19 | # Assumes this module is in netanalysis/infrastructure
20 | _PACKAGE_DIR = os.path.dirname(__file__)
21 | assert (_PACKAGE_DIR.split(os.path.sep)[-2:] == ["netanalysis", "infrastructure"]), \
22 |     "resources.py in invalid directory %s" % _PACKAGE_DIR
23 | 
24 | _RESOURCES_ROOT = os.path.normpath(os.path.join(_PACKAGE_DIR, ".."))
25 | 
26 | 
27 | def resource_filename(resource_path: str) -> str:
28 |     filesystem_path = os.path.join(*resource_path.split("/"))
29 |     return os.path.join(_RESOURCES_ROOT, filesystem_path)
30 | 
--------------------------------------------------------------------------------
/netanalysis/ip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/ip/__init__.py
--------------------------------------------------------------------------------
/netanalysis/ip/ip_info.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2018 Jigsaw Operations LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # TODO:
18 | # - Get SOA for PTR record
19 | # - Show city and country
20 | 
21 | import argparse
22 | import asyncio
23 | import ipaddress
24 | import pprint
25 | import socket
26 | import sys
27 | from typing import Tuple
28 | 
29 | import geoip2.database
30 | 
31 | from netanalysis.tls import domain_ip_validator
32 | from netanalysis.infrastructure.resources import resource_filename
33 | 
34 | from . import model
35 | from . import simple_autonomous_system as sas
36 | 
37 | 
38 | class IpInfoService:
39 |     def __init__(self, as_repo: model.AsRepository, ip_to_asn, ip_to_country):
40 |         self._as_repo = as_repo
41 |         self._ip_to_asn = ip_to_asn
42 |         self._ip_to_country = ip_to_country
43 | 
44 |     def get_as(self, ip: model.IpAddress) -> model.AutonomousSystem:
45 |         try:
46 |             asn = self._ip_to_asn.asn(ip.compressed).autonomous_system_number
47 |         except Exception:
48 |             asn = -1
49 |         return self._as_repo.get_as(asn)
50 | 
51 |     def get_country(self, ip: model.IpAddress) -> Tuple[str, str]:
52 |         "Returns country code and country name for the IP"
53 |         # TODO: Consider exposing the confidence value
54 |         try:
55 |             country_record: geoip2.records.Country = self._ip_to_country.country(
56 |                 ip.compressed).country
57 |             if not country_record:
58 |                 return ("ZZ", "Unknown")
59 |             return (str(country_record.iso_code), str(country_record.name))
60 |         except Exception:
61 |             return ("ZZ", "Unknown")
62 | 
63 |     def resolve_ip(self, ip: model.IpAddress) -> str:
64 |         try:
65 |             return socket.gethostbyaddr(ip.compressed)[0]
66 |         except socket.herror:
67 |             return None
68 | 
69 | 
70 | def create_default_ip_info_service() -> IpInfoService:
71 |     as_repo = sas.create_default_as_repo()
72 |     ip_asn = geoip2.database.Reader(resource_filename(
73 |         "third_party/db-ip/dbip-asn-lite/dbip-asn-lite.mmdb"))
74 |     ip_country = geoip2.database.Reader(resource_filename(
75 |         "third_party/db-ip/dbip-country-lite/dbip-country-lite.mmdb"))
76 |     return IpInfoService(as_repo, ip_asn, ip_country)
77 | 
78 | 
79 | def main(args):
80 |     ip_info = create_default_ip_info_service()
81 | 
82 |     ip_address = args.ip_address[0]
83 |     print("Country: %s (%s)" % ip_info.get_country(ip_address))
84 |     asys: model.AutonomousSystem = ip_info.get_as(ip_address)
85 |     print("ASN: %d (%s)" % (asys.id, asys.name))
86 |     # AS type data is experimental and outdated.
87 |     print("Type: %s" % asys.type.name)
88 |     print("Org: %s (country: %s, name: %s)" %
89 |           (asys.org.id, asys.org.country, asys.org.name))
90 |     if ip_address.is_global:
91 |         hostname = ip_info.resolve_ip(ip_address)
92 |         if hostname:
93 |             print("Hostname: %s" % hostname)
94 |     else:
95 |         print("IP is not global")
96 |     validator = domain_ip_validator.DomainIpValidator()
97 |     try:
98 |         cert = asyncio.get_event_loop().run_until_complete(
99 |             validator.get_cert(None, ip_address))
100 |         if cert:
101 |             print("TLS Certificate:\n%s" %
102 |                   pprint.pformat(cert, width=100, compact=True))
103 |     except Exception as e:
104 |         print("TLS Certificate: %s" % repr(e))
105 | 
106 | 
107 | if __name__ == "__main__":
108 |     parser = argparse.ArgumentParser(
109 |         description='Gets information about the given IP address')
110 |     parser.add_argument('ip_address', type=ipaddress.ip_address,
111 |                         nargs=1, help='The IP address to get information for')
112 |     sys.exit(main(parser.parse_args()))
113 | 
--------------------------------------------------------------------------------
/netanalysis/ip/ip_info_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from ipaddress import ip_address as ip
4 | 
5 | from . import ip_info as ii
6 | 
7 | # Google DNS addresses are stably assigned to Google's AS.
8 | GOOGLE_DNS_IP4_8888 = ip("8.8.8.8")
9 | GOOGLE_DNS_IP4_8844 = ip("8.8.4.4")
10 | GOOGLE_DNS_IP6_8888 = ip("2001:4860:4860::8888")
11 | GOOGLE_DNS_IP6_8844 = ip("2001:4860:4860::8844")
12 | GOOGLE_ASN = 15169
13 | 
14 | 
15 | class TestIpInfo(unittest.TestCase):
16 |     @classmethod
17 |     def setUpClass(cls):
18 |         cls._ip_service = ii.create_default_ip_info_service()
19 | 
20 |     def test_ip4_to_as(self):
21 |         self.assertEqual(GOOGLE_ASN, self._ip_service.get_as(
22 |             GOOGLE_DNS_IP4_8888).id)
23 |         self.assertEqual(GOOGLE_ASN, self._ip_service.get_as(
24 |             GOOGLE_DNS_IP4_8844).id)
25 |         self.assertEqual(-1, self._ip_service.get_as(ip("127.0.0.1")).id)
26 | 
27 |     def test_ip6_to_as(self):
28 |         self.assertEqual(GOOGLE_ASN, self._ip_service.get_as(
29 |             GOOGLE_DNS_IP6_8888).id)
30 |         self.assertEqual(GOOGLE_ASN, self._ip_service.get_as(
31 |             GOOGLE_DNS_IP6_8844).id)
32 |         self.assertEqual(-1, self._ip_service.get_as(ip("::1")).id)
33 | 
34 |     def test_ip4_to_country(self):
35 |         # nycourts.gov
36 |         self.assertEqual(("US", "United States"),
37 |                          self._ip_service.get_country(ip("207.29.128.60")))
38 |         # Technical Research Centre of Finland
39 |         self.assertEqual(("FI", "Finland"),
40 |                          self._ip_service.get_country(ip("130.188.0.0")))
41 |         self.assertEqual(("ZZ", "Unknown"),
42 |                          self._ip_service.get_country(ip("127.0.0.1")))
43 | 
44 |     def test_ip6_to_country(self):
45 |         # Instituto Costarricense de Electricidad y Telecom
46 |         self.assertEqual(("CR", "Costa Rica"), self._ip_service.get_country(
47 |             ip("2001:1330::")))
48 |         # Wikimedia Foundation
49 |         self.assertEqual(("US", "United States"), self._ip_service.get_country(
50 |             ip("2620:62:c000::")))
51 |         self.assertEqual(("ZZ", "Unknown"),
52 |                          self._ip_service.get_country(ip("::1")))
53 | 
54 |     def test_resolve_ip4(self):
55 |         self.assertEqual(
56 |             "dns.google", self._ip_service.resolve_ip(GOOGLE_DNS_IP4_8888))
57 |         self.assertEqual(
58 |             "dns.google", self._ip_service.resolve_ip(GOOGLE_DNS_IP4_8844))
59 |         self.assertEqual(
60 |             "localhost", self._ip_service.resolve_ip(ip("127.0.0.1")))
61 | 
62 |     def test_resolve_ip6(self):
63 |         self.assertEqual("dns.google", self._ip_service.resolve_ip(
64 |             GOOGLE_DNS_IP6_8888))
65 |         self.assertEqual("dns.google", self._ip_service.resolve_ip(
66 |             GOOGLE_DNS_IP6_8844))
67 |         self.assertEqual(
68 |             "localhost", self._ip_service.resolve_ip(ip("::1")))
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 | 
--------------------------------------------------------------------------------
/netanalysis/ip/model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2017 Jigsaw Operations LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
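#
# Overview: AutonomousSystem exposes an ASN id, a name, an owning AsOrg, and
# an AsType (from CAIDA's AS classification data); AsRepository maps AS
# numbers to AutonomousSystem instances.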
16 | 
17 | import abc
18 | from collections import namedtuple
19 | from enum import Enum
20 | import ipaddress
21 | from typing import Union
22 | 
23 | IpAddress = Union[ipaddress.IPv4Address, ipaddress.IPv6Address]
24 | 
25 | AsOrg = namedtuple("AsOrg", ["id", "name", "country", "source", "date_changed_str"])
26 | 
27 | 
28 | class AsType(Enum):
29 |     UNKNOWN = 0
30 |     CONTENT = 1
31 |     ENTERPRISE = 2
32 |     TRANSIT_ACCESS = 3
33 | 
34 | 
35 | class AutonomousSystem(abc.ABC):
36 |     @abc.abstractmethod
37 |     def id(self) -> int: pass
38 |     @abc.abstractmethod
39 |     def name(self) -> str: pass
40 |     @abc.abstractmethod
41 |     def org(self) -> AsOrg: pass
42 |     @abc.abstractmethod
43 |     def type(self) -> AsType: pass
44 | 
45 | 
46 | class AsRepository(abc.ABC):
47 |     @abc.abstractmethod
48 |     def get_as(self, as_number: int) -> AutonomousSystem:
49 |         pass
50 | 
--------------------------------------------------------------------------------
/netanalysis/ip/simple_autonomous_system.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2017 Jigsaw Operations LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import gzip
18 | from typing import Dict
19 | 
20 | from netanalysis.infrastructure.resources import resource_filename
21 | 
22 | from .
import model 23 | 24 | 25 | class SimpleAutonomousSystem(model.AutonomousSystem): 26 | def __init__(self, as_repo: model.AsRepository, as_number: int, as_name: str, org_id: str, 27 | source: str, date_changed_str: str) -> None: 28 | self._as_repo = as_repo 29 | self._id = as_number 30 | self._name = as_name 31 | self._org_id = org_id 32 | self.source = source 33 | self.date_changed_str = date_changed_str 34 | self._type = model.AsType.UNKNOWN 35 | 36 | @property 37 | def id(self): return self._id 38 | 39 | @property 40 | def name(self): return self._name 41 | 42 | @property 43 | def type(self): return self._type 44 | 45 | @type.setter 46 | def type(self, new_type): self._type = new_type 47 | 48 | @property 49 | def org(self): 50 | return self._as_repo.get_org(self._org_id) 51 | 52 | def __repr__(self): 53 | return "%s(%r)" % (self.__class__, self.__dict__) 54 | 55 | 56 | def UnknownAutonomousSystem(as_repo, as_number): 57 | return SimpleAutonomousSystem(as_repo, as_number, "AS%d" % as_number, None, None, None) 58 | 59 | 60 | def UnknownAsOrg(org_id): 61 | return model.AsOrg(org_id, org_id, None, None, None) 62 | 63 | 64 | class InMemoryAsRepository(model.AsRepository): 65 | def __init__(self) -> None: 66 | self.id_as: Dict[int, model.AutonomousSystem] = {} 67 | self.id_org: Dict[str, model.AsOrg] = {} 68 | 69 | def add_as(self, as_number: int, as_name: str, org_id: str, 70 | source: str, date_changed_str: str) -> None: 71 | self.id_as[as_number] = SimpleAutonomousSystem( 72 | self, as_number, as_name, org_id, source, date_changed_str) 73 | 74 | def add_org(self, org_id: str, org_name: str, org_country: str, 75 | source: str, date_changed_str: str) -> None: 76 | self.id_org[org_id] = model.AsOrg( 77 | org_id, org_name, org_country, source, date_changed_str) 78 | 79 | def get_as(self, as_number: int) -> model.AutonomousSystem: 80 | autonomous_system = self.id_as.get(as_number) 81 | if not autonomous_system: 82 | return UnknownAutonomousSystem(self, as_number) 83 | return autonomous_system 84 | 85 | def get_org(self, org_id: str) -> model.AsOrg: 86 | org = self.id_org.get(org_id) 87 | if not org: 88 | return UnknownAsOrg(org_id) 89 | return org 90 | 91 | 92 | def fill_as_info_from_filename(as_org_filename: str, as_repo: InMemoryAsRepository): 93 | with gzip.open(as_org_filename, "rt") as as_file: 94 | return fill_as_info_from_file(as_file, as_repo) 95 | 96 | 97 | def fill_as_info_from_file(as_org_file, as_repo: InMemoryAsRepository): 98 | mode = None 99 | while True: 100 | line = as_org_file.readline() 101 | if len(line) == 0: 102 | break 103 | line = line.strip() 104 | if not line or line[0] == "#": 105 | if line.startswith("# format:org_id"): 106 | mode = "org" 107 | elif line.startswith("# format:aut"): 108 | mode = "as" 109 | continue 110 | if mode == "as": 111 | as_number_str, date_changed_str, as_name, org_id, opaque_id, source = \ 112 | line.split("|") 113 | as_number = int(as_number_str) 114 | as_repo.add_as(as_number, as_name, org_id, 115 | source, date_changed_str) 116 | elif mode == "org": 117 | org_id, date_changed_str, org_name, org_country, source = \ 118 | line.split("|") 119 | as_repo.add_org(org_id, org_name, org_country, 120 | source, date_changed_str) 121 | 122 | 123 | def fill_as_type_from_filename(filename: str, as_repo: InMemoryAsRepository): 124 | with gzip.open(filename, "rt") as as_type_file: 125 | fill_as_type_from_file(as_type_file, as_repo) 126 | 127 | 128 | def fill_as_type_from_file(as_type_file, as_repo: InMemoryAsRepository): 129 | str_to_type = { 130 | 
"Content": model.AsType.CONTENT, 131 | "Enterprise": model.AsType.ENTERPRISE, 132 | "Transit/Access": model.AsType.TRANSIT_ACCESS, 133 | } 134 | for line in as_type_file: 135 | line = line.strip() 136 | if not line or line[0] == "#": 137 | continue 138 | as_number_str, _source, as_type_str = line.split("|") 139 | as_number = int(as_number_str) 140 | asys = as_repo.get_as(as_number) 141 | if asys: 142 | asys.type = str_to_type[as_type_str] 143 | 144 | 145 | def create_default_as_repo() -> InMemoryAsRepository: 146 | as_repo = InMemoryAsRepository() 147 | 148 | as_info_filename = resource_filename( 149 | "third_party/caida.org/as-organizations/as-org2info.txt.gz") 150 | fill_as_info_from_filename(as_info_filename, as_repo) 151 | 152 | as_type_filename = resource_filename( 153 | "third_party/caida.org/as-classification/as2types.txt.gz") 154 | fill_as_type_from_filename(as_type_filename, as_repo) 155 | return as_repo 156 | -------------------------------------------------------------------------------- /netanalysis/ip/test_simple_autonomous_system.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2017 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import ipaddress 18 | import unittest 19 | 20 | from . import ip_info as ii 21 | from . 
import simple_autonomous_system as sas
22 | 
23 | 
24 | class TestAsRepository(unittest.TestCase):
25 |     def test_unknown_as(self):
26 |         as_repo = sas.InMemoryAsRepository()
27 |         asys = as_repo.get_as(999)
28 |         self.assertIsNotNone(asys)
29 |         self.assertEqual(999, asys.id)
30 |         self.assertEqual("AS999", asys.name)
31 | 
32 |     def test_unknown_org(self):
33 |         as_repo = sas.InMemoryAsRepository()
34 |         org = as_repo.get_org("orgX")
35 |         self.assertIsNotNone(org)
36 |         self.assertEqual("orgX", org.id)
37 | 
38 |     def test_add_as(self):
39 |         as_repo = sas.InMemoryAsRepository()
40 |         as_repo.add_as("AS1", "First AS", "org1", "test_data", "sometime")
41 |         as1 = as_repo.get_as("AS1")
42 |         self.assertEqual("AS1", as1.id)
43 |         self.assertEqual("First AS", as1.name)
44 |         self.assertEqual("org1", as1.org.id)
45 | 
46 |     def test_add_org(self):
47 |         as_repo = sas.InMemoryAsRepository()
48 |         as_repo.add_org("org1", "First Org", "Country1",
49 |                         "test_data", "sometime")
50 |         org = as_repo.get_org("org1")
51 |         self.assertEqual("org1", org.id)
52 |         self.assertEqual("First Org", org.name)
53 |         self.assertEqual("Country1", org.country)
54 | 
55 |     def test_as_org(self):
56 |         as_repo = sas.InMemoryAsRepository()
57 |         as_repo.add_as("AS1", "First AS", "org1", "test_data", "sometime")
58 |         as_repo.add_org("org1", "First Org", "Country1",
59 |                         "test_data", "sometime")
60 |         org = as_repo.get_as("AS1").org
61 |         self.assertEqual("org1", org.id)
62 |         self.assertEqual("First Org", org.name)
63 |         self.assertEqual("Country1", org.country)
64 | 
65 | 
66 | class TestIpToAsnMap(unittest.TestCase):
67 |     def test_default_data(self):
68 |         ip_info = ii.create_default_ip_info_service()
69 |         self.assertEqual(15169, ip_info.get_as(ipaddress.ip_address("8.8.8.8")).id)
70 |         self.assertEqual(13335, ip_info.get_as(ipaddress.ip_address("1.1.1.1")).id)
71 |         self.assertEqual(-1, ip_info.get_as(ipaddress.ip_address("127.0.0.1")).id)
72 | 
73 | 
74 | if __name__ == '__main__':
75 |     unittest.main()
76 | 
--------------------------------------------------------------------------------
/netanalysis/ooni/README.md:
--------------------------------------------------------------------------------
1 | # OONI Measurement Data
2 | 
3 | This directory contains libraries and tools to fetch [OONI data](https://ooni.org/data/). This tool supports access to the measurements on both the new `ooni-data-eu-fra` and the old `ooni-data` OONI S3 buckets.
4 | 
5 | You can install the library directly from GitHub with
6 | 
7 |     pip install git+https://github.com/Jigsaw-Code/net-analysis.git@master
8 | 
9 | 
10 | ## Fetch the measurement data
11 | 
12 | To fetch the last 14 days of data for a country into an `ooni_data/` directory:
13 | 
14 |     python -m netanalysis.ooni.data.sync_measurements --country=BY --output_dir=./ooni_data/
15 | 
16 | If you call it a second time, it will skip data already downloaded.
17 | 
18 | Use `--first_date` and `--last_date` to restrict the fetch to a specific, inclusive date range. For example:
19 | 
20 |     python -m netanalysis.ooni.data.sync_measurements --output_dir=./ooni_data/ --country=BY --first_date=2021-01-01 --last_date=2021-01-31
21 | 
22 | ### Data trimming
23 | By default the tool will drop any measurement fields that are longer than 1000 characters in order to save space. You can change that by passing a different value for `--max_string_size`.
24 | 
25 | This is primarily intended to drop the response bodies, which are often not needed and take most of the space.
For the date range example above, we download 158 MiB of data, but only store 18 MiB after the trimming, a nearly 9x difference!
26 | 
27 | ### Test types
28 | By default the tool will download `webconnectivity` tests only. You can select a different test type with `--test_type`.
29 | 
30 | ### Cost limit
31 | To prevent accidental costs, the tool stops fetching once it has downloaded enough data to cost OONI an estimated $1.00 USD. You can override that limit with `--cost_limit_usd`.
32 | 
33 | ## Direct S3 access
34 | 
35 | You can use the AWS CLI to access the bucket. For example:
36 | ```
37 | aws --no-sign-request s3 ls s3://ooni-data-eu-fra/raw/20210526/00/VE/webconnectivity/
38 | ```
39 | 
40 | You can fetch data for a country, test type, and date with this:
41 | ```
42 | aws --no-sign-request s3 sync --exclude '*' --include '*/VE/webconnectivity/*.jsonl.gz' 's3://ooni-data-eu-fra/raw/20210501' ./ooni-data/20210501/
43 | ```
44 | 
45 | However, that only covers the new bucket. Our script supports the old bucket as well, is faster, lets you specify a date range, limits cost, and trims the stored data.
46 | 
--------------------------------------------------------------------------------
/netanalysis/ooni/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/ooni/__init__.py
--------------------------------------------------------------------------------
/netanalysis/ooni/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/ooni/data/__init__.py
--------------------------------------------------------------------------------
/netanalysis/ooni/data/ooni_client.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Jigsaw Operations LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
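#
# A minimal usage sketch (assumes network access to the public OONI S3
# buckets; requests are unsigned):
#
#   import datetime as dt
#   client = OoniClient()
#   for entry in client.list_files(dt.date(2021, 1, 1), dt.date(2021, 1, 31),
#                                  'webconnectivity', 'BY'):
#       for measurement in entry.get_measurements():
#           ...  # each measurement is a parsed JSON dict
#   print(client.cost_usd)  # estimated S3 cost of the requests made so far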
14 | 15 | import datetime as dt 16 | from contextlib import closing 17 | import gzip 18 | from pathlib import PosixPath 19 | import posixpath 20 | from typing import Callable, Dict, Iterable 21 | from urllib.parse import SplitResult 22 | 23 | import boto3 24 | from botocore import UNSIGNED 25 | from botocore.config import Config 26 | import lz4.frame 27 | import ujson 28 | 29 | 30 | class FileEntry: 31 | """Represents a file entry in the OONI S3 Bucket.""" 32 | 33 | def __init__(self, get_measurements: Callable[[], Iterable[object]], test_type: str, country: str, date: dt.date, url: SplitResult, size: int) -> None: 34 | self.get_measurements = get_measurements 35 | self.test_type = test_type 36 | self.country = country 37 | self.date = date 38 | self.url = url 39 | self.size = size 40 | 41 | 42 | class OoniClient: 43 | def __init__(self): 44 | s3_client = boto3.client( 45 | 's3', config=Config(signature_version=UNSIGNED)) 46 | self._new_client = _2020OoniClient(s3_client) 47 | self._legacy_client = _LegacyOoniClient(s3_client) 48 | 49 | @property 50 | def num_list_requests(self) -> int: 51 | return self._new_client.num_list_requests + self._legacy_client.num_list_requests 52 | 53 | @property 54 | def num_get_requests(self) -> int: 55 | return self._new_client.num_get_requests + self._legacy_client.num_get_requests 56 | 57 | @property 58 | def bytes_downloaded(self) -> int: 59 | return self._new_client.bytes_downloaded + self._legacy_client.bytes_downloaded 60 | 61 | @property 62 | def cost_usd(self) -> float: 63 | # From https://aws.amazon.com/s3/pricing/ 64 | data_cost = 0.09 * self.bytes_downloaded / 2**30 # $0.09 per GiB 65 | request_cost = (0.0004 * self.num_get_requests / 1000 + 66 | 0.005 * self.num_list_requests / 1000) 67 | return data_cost + request_cost 68 | 69 | def list_files(self, first_date: dt.date, last_date: dt.date, test_type: str, country: str) -> Iterable[FileEntry]: 70 | yield from self._legacy_client.list_files(first_date, last_date, test_type, country) 71 | yield from self._new_client.list_files(first_date, last_date, test_type, country) 72 | 73 | 74 | class _2020OoniClient: 75 | _BUCKET = 'ooni-data-eu-fra' 76 | _PREFIX = 'raw/' 77 | 78 | def __init__(self, s3_client): 79 | self._s3_client = s3_client 80 | self.num_get_requests = 0 81 | self.num_list_requests = 0 82 | self.bytes_downloaded = 0 83 | 84 | # Example files: `aws --no-sign-request s3 ls s3://ooni-data-eu-fra/raw/20210526/00/VE/webconnectivity/` 85 | # First directory in the new bucket is 20201020/ 86 | def list_files(self, first_date: dt.date, last_date: dt.date, test_type: str, country: str) -> Iterable[FileEntry]: 87 | if last_date < dt.date(2020, 10, 20): 88 | return 89 | paginator = self._s3_client.get_paginator('list_objects_v2') 90 | pages = paginator.paginate(Bucket=_2020OoniClient._BUCKET, Delimiter='/', Prefix=_2020OoniClient._PREFIX, 91 | StartAfter=f'{_2020OoniClient._PREFIX}{first_date.strftime("%Y%m%d")}') 92 | for page in pages: 93 | self.num_list_requests += 1 94 | for entry in page.get('CommonPrefixes', []): 95 | date_dir = entry['Prefix'] 96 | date_str = posixpath.basename(posixpath.dirname(date_dir)) 97 | date = dt.datetime.strptime(date_str, "%Y%m%d").date() 98 | if date > last_date: 99 | return 100 | for hour in range(24): 101 | prefix = f'''{date_dir}{hour:02}/{country}/''' 102 | if test_type: 103 | prefix += f'{test_type}/' 104 | for page in paginator.paginate(Bucket=page['Name'], Prefix=prefix): 105 | self.num_list_requests += 1 106 | for entry in page.get('Contents', []): 107 | 
key = entry['Key'] 108 | file_path = PosixPath(key) 109 | if file_path.name.endswith('.jsonl.gz'): 110 | file_test_type = file_path.parent.name 111 | url = SplitResult( 112 | 's3', page['Name'], key, None, None) 113 | yield FileEntry(lambda: self._get_measurements(url), file_test_type, country, date, url, entry['Size']) 114 | 115 | def _get_measurements(self, url: SplitResult) -> Iterable[Dict]: 116 | s3_object = self._s3_client.get_object(Bucket=url.netloc, Key=url.path) 117 | self.num_get_requests += 1 118 | self.bytes_downloaded += s3_object['ContentLength'] 119 | with closing(s3_object['Body']) as remote_file, gzip.GzipFile(fileobj=remote_file, mode='r') as source_file: 120 | for line in source_file: 121 | yield ujson.loads(line) 122 | 123 | 124 | class _LegacyOoniClient: 125 | _BUCKET = 'ooni-data' 126 | _PREFIX = 'autoclaved/jsonl.tar.lz4/' 127 | 128 | @staticmethod 129 | def _test_type_for_match(measurement_type: str): 130 | return measurement_type.replace('_', '') 131 | 132 | # Example filename: 20200801T144129Z-BR-AS28573-web_connectivity-20200801T144133Z_AS28573_hlwQt15JxAkU6kYEfTrZL8JbTrTY06WzBRAUIu6zR4b6H3ww7m-0.2.0-probe.json.lz4 133 | @staticmethod 134 | def _filename_matches(filename: str, test_type: str, country: str) -> bool: 135 | basename = posixpath.basename(filename) 136 | parts = basename.split('-') 137 | if len(parts) < 4: 138 | return False 139 | return parts[1] == country and _LegacyOoniClient._test_type_for_match(parts[3]) == test_type 140 | 141 | @staticmethod 142 | def _files_from_index(json_lines: Iterable[str], test_type: str, country: str): 143 | # Format defined at https://ooni.org/post/mining-ooni-data/ 144 | # file is a lz4 file on S3. Key "filename" is the file name. 145 | # report is a standalone json.lz4 file, or a file embedded in a tar.lz4 file set. Keys "textname" is the jsonl report name. 146 | # datum is a single measurement as a JSON object. 147 | # frame is a LZ4 frame with multiple measurements. Its boundaries don't necessarily align with files or reports. 
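        # A hypothetical excerpt (values are illustrative only):
        #   {"type": "file", "filename": "2020-08-01/....tar.lz4"}
        #   {"type": "report", "textname": "....-BR-AS28573-web_connectivity-....json"}
        #   {"type": "frame", "file_off": 0, "file_size": 4096}
        #   {"type": "datum", "text_off": 0, "text_size": 2345}
        # ...closed by matching "/frame", "/report", and "/file" entries.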
148 | current_file = {} 149 | current_frame = {} 150 | output_measurements = False 151 | for line in json_lines: 152 | entry = ujson.loads(line) 153 | if entry['type'] == 'file': 154 | current_file = entry 155 | current_file['frames'] = [] 156 | elif entry['type'] == '/file': 157 | if len(current_file.get('frames', [])) > 0: 158 | yield current_file 159 | current_file = {} 160 | elif entry['type'] == 'report': 161 | report_name = entry['textname'] 162 | if _LegacyOoniClient._filename_matches(report_name, test_type, country): 163 | output_measurements = True 164 | elif entry['type'] == '/report': 165 | output_measurements = False 166 | elif entry['type'] == 'frame': 167 | current_frame = entry 168 | current_frame['data'] = [] 169 | elif entry['type'] == '/frame': 170 | if len(current_frame.get('data', [])) > 0: 171 | current_file['frames'].append(current_frame) 172 | current_frame = {} 173 | elif entry['type'] == 'datum': 174 | if output_measurements: 175 | current_frame['data'].append(entry) 176 | 177 | @staticmethod 178 | def _frame_bytes(frames: Iterable[Dict]) -> int: 179 | total_bytes = 0 180 | for frame in frames: 181 | total_bytes += frame['file_size'] 182 | return total_bytes 183 | 184 | def __init__(self, s3_client): 185 | self._s3_client = s3_client 186 | self.num_get_requests = 0 187 | self.num_list_requests = 0 188 | self.bytes_downloaded = 0 189 | 190 | # Example files: `aws --no-sign-request s3 ls s3://ooni-data/autoclaved/jsonl.tar.lz4/2020-08-01/` 191 | # First directory is 2012-12-05/, last is 2020-10-21/. 192 | def list_files(self, first_date: dt.date, last_date: dt.date, test_type: str, country: str) -> Iterable[FileEntry]: 193 | if first_date > dt.date(2020, 10, 21): 194 | return 195 | paginator = self._s3_client.get_paginator('list_objects_v2') 196 | pages = paginator.paginate(Bucket=_LegacyOoniClient._BUCKET, Delimiter='/', Prefix=_LegacyOoniClient._PREFIX, 197 | StartAfter=f'{_LegacyOoniClient._PREFIX}{first_date.strftime("%Y-%m-%d")}') 198 | for page in pages: 199 | self.num_list_requests += 1 200 | for entry in page.get('CommonPrefixes', []): 201 | date_dir = entry['Prefix'] 202 | date_str = posixpath.basename(posixpath.dirname(date_dir)) 203 | date = dt.datetime.strptime(date_str, "%Y-%m-%d").date() 204 | if date > last_date: 205 | return 206 | for file_entry in self._list_files_with_index(date_dir, test_type, country): 207 | url = SplitResult('s3', _LegacyOoniClient._BUCKET, f'{_LegacyOoniClient._PREFIX}{file_entry["filename"]}', None, None) 208 | yield FileEntry(lambda file_entry=file_entry: self._get_measurements(file_entry), test_type, country, date, url, _LegacyOoniClient._frame_bytes(file_entry['frames']))  # Bind file_entry by value to avoid the late-binding closure pitfall. 209 | 210 | def _list_files_with_index(self, date_dir: str, test_type: str, country: str) -> Iterable[Dict]: 211 | s3_object = self._s3_client.get_object( 212 | Bucket=_LegacyOoniClient._BUCKET, Key=f'{date_dir}index.json.gz') 213 | self.num_get_requests += 1 214 | self.bytes_downloaded += s3_object['ContentLength'] 215 | with gzip.open(s3_object['Body'], mode='rt', encoding='utf8') as json_lines: 216 | yield from _LegacyOoniClient._files_from_index(json_lines, test_type, country) 217 | 218 | def _get_measurements(self, file_entry: Dict) -> Iterable[Dict]: 219 | s3_key = f'{_LegacyOoniClient._PREFIX}{file_entry["filename"]}' 220 | frames = file_entry['frames'] 221 | fi = 0 222 | while fi < len(frames): 223 | # We merge adjacent frames into segments to reduce the number of requests.
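# Illustrative example (made-up offsets): frames at (file_off, file_size) = (0, 100),
# (100, 50) and (300, 40) produce two segments, bytes 0-149 and bytes 300-339,
# so the three frames are fetched with two ranged GETs instead of three.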
224 | segment_start = frames[fi]['file_off'] 225 | segment_end = segment_start 226 | segment = [] 227 | while fi < len(frames) and frames[fi]['file_off'] == segment_end: 228 | segment_end += frames[fi]['file_size'] 229 | segment.append(frames[fi]) 230 | fi += 1 231 | stream = self._s3_client.get_object( 232 | Bucket=_LegacyOoniClient._BUCKET, Key=s3_key, Range=f'bytes={segment_start}-{segment_end - 1}')['Body'] 233 | self.num_get_requests += 1 234 | self.bytes_downloaded += segment_end - segment_start 235 | with lz4.frame.LZ4FrameFile(stream, mode='r') as lz4_file: 236 | bytes_read = 0 237 | for frame in segment: 238 | for entry in frame['data']: 239 | skip = entry['text_off'] - bytes_read 240 | if skip > 0: 241 | lz4_file.read(skip) 242 | measurement_str = lz4_file.read(size=entry['text_size']) 243 | bytes_read = entry['text_off'] + entry['text_size'] 244 | yield ujson.loads(measurement_str) 245 | -------------------------------------------------------------------------------- /netanalysis/ooni/data/sync_measurements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2021 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | import datetime as dt 19 | from functools import singledispatch 20 | import gzip 21 | import logging 22 | from multiprocessing.pool import ThreadPool 23 | import os 24 | import pathlib 25 | import sys 26 | from typing import List 27 | 28 | import ujson 29 | 30 | from . import ooni_client 31 | 32 | 33 | @singledispatch 34 | def trim_measurement(json_obj, max_string_size: int): 35 | return json_obj 36 | 37 | 38 | @trim_measurement.register(dict) 39 | def _(json_dict: dict, max_string_size: int): 40 | keys_to_delete: List[str] = [] 41 | for key, value in json_dict.items(): 42 | if type(value) == str and len(value) > max_string_size: 43 | keys_to_delete.append(key) 44 | else: 45 | trim_measurement(value, max_string_size) 46 | for key in keys_to_delete: 47 | del json_dict[key] 48 | return json_dict 49 | 50 | 51 | @trim_measurement.register(list) 52 | def _(json_list: list, max_string_size: int): 53 | for item in json_list: 54 | trim_measurement(item, max_string_size) 55 | return json_list 56 | 57 | 58 | class CostLimitError(Exception): 59 | def __init__(self, message: str) -> None: 60 | super().__init__(message) 61 | 62 | def _make_local_path(output_dir: pathlib.Path, entry: ooni_client.FileEntry) -> pathlib.Path: 63 | basename = pathlib.PurePosixPath(entry.url.path).name 64 | # Convert .json.lz4 and .tar.lz4 filenames.
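# Illustrative conversion (hypothetical names): 'report-0.2.0-probe.json.lz4' and
# 'report-0.2.0-probe.tar.lz4' both become 'report-0.2.0-probe.jsonl.gz'.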
65 | if not basename.endswith('.jsonl.gz'): 66 | basename = basename.rsplit('.', 2)[0] + '.jsonl.gz' 67 | return output_dir / entry.country / entry.test_type / f'{entry.date:%Y-%m-%d}' / basename 68 | 69 | 70 | def main(args): 71 | logging.basicConfig(level=logging.INFO) 72 | if args.debug: 73 | logging.getLogger().setLevel(logging.DEBUG)  # basicConfig is a no-op once logging is configured; setLevel takes effect. 74 | 75 | ooni = ooni_client.OoniClient() 76 | num_measurements = 0 77 | file_entries = ooni.list_files( 78 | args.first_date, args.last_date, args.test_type, args.country) 79 | 80 | def sync_file(entry: ooni_client.FileEntry): 81 | target_file_path = _make_local_path(args.output_dir, entry) 82 | if target_file_path.is_file(): 83 | return f'Skipped existing {entry.url.geturl()}' 84 | return fetch_file(entry, target_file_path) 85 | 86 | def fetch_file(entry: ooni_client.FileEntry, target_file_path: pathlib.Path): 87 | nonlocal num_measurements 88 | os.makedirs(target_file_path.parent, exist_ok=True) 89 | if ooni.cost_usd > args.cost_limit_usd: 90 | raise CostLimitError( 91 | f'Downloaded {ooni.bytes_downloaded / 2**20:.1f} MiB') 92 | # We use a temporary file to atomically write the destination and make sure we don't have partially written files. 93 | # We put the temporary file in the same location as the destination because you can't atomically 94 | # rename if they are in different devices, as is the case for Kaggle. 95 | temp_path = target_file_path.with_name(f'{target_file_path.name}.tmp') 96 | try: 97 | with gzip.open(temp_path, mode='wt', encoding='utf-8', newline='\n') as target_file: 98 | for measurement in entry.get_measurements(): 99 | num_measurements += 1 100 | m = trim_measurement(measurement, args.max_string_size) 101 | ujson.dump(m, target_file) 102 | target_file.write('\n') 103 | temp_path.replace(target_file_path) 104 | except: 105 | temp_path.unlink() 106 | raise 107 | return f'Downloaded {entry.url.geturl()} [{entry.size:,} bytes]' 108 | 109 | with ThreadPool(processes=5 * os.cpu_count()) as sync_pool: 110 | for msg in sync_pool.imap_unordered(sync_file, file_entries): 111 | logging.info(msg) 112 | 113 | logging.info(f'Measurements: {num_measurements}, Downloaded {ooni.bytes_downloaded/2**20:0.3f} MiB, Estimated Cost: ${ooni.cost_usd:.2f}') 114 | 115 | 116 | def _parse_date_flag(date_str: str) -> dt.date: 117 | return dt.datetime.strptime(date_str, "%Y-%m-%d").date() 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = argparse.ArgumentParser("Sync OONI measurements") 122 | parser.add_argument("--country", type=str, required=True) 123 | parser.add_argument("--first_date", type=_parse_date_flag, 124 | default=dt.date.today() - dt.timedelta(days=14)) 125 | parser.add_argument("--last_date", type=_parse_date_flag, 126 | default=dt.date.today()) 127 | parser.add_argument("--test_type", type=str, default='webconnectivity') 128 | parser.add_argument("--max_string_size", type=int, default=1000) 129 | parser.add_argument("--cost_limit_usd", type=float, default=1.00) 130 | parser.add_argument("--output_dir", type=pathlib.Path, required=True) 131 | parser.add_argument("--debug", action="store_true") 132 | sys.exit(main(parser.parse_args())) 133 | -------------------------------------------------------------------------------- /netanalysis/ooni/measurements_to_dns_records.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2018 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in 
compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Reads OONI measurements and outputs the DNS resource records. 19 | 20 | Sample usage: 21 | python -m netanalysis.ooni.measurements_to_dns_records \ 22 | --ooni_measurements_dir=ooni_data/ 23 | """ 24 | 25 | import argparse 26 | import datetime 27 | import glob 28 | import ipaddress 29 | import logging 30 | import os 31 | import os.path 32 | import pprint 33 | import sys 34 | from typing import Iterable, List 35 | from urllib.parse import urlparse 36 | 37 | import ujson as json 38 | 39 | from netanalysis.dns.data import model as dns 40 | from netanalysis.dns.data import serialization as ds 41 | 42 | 43 | def parse_ooni_date(date_str: str) -> datetime.datetime: 44 | # TODO: Set the timezone 45 | return datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") 46 | 47 | 48 | def get_control_dns_measurement(measurement, measurement_id) -> dns.DnsMeasurement: 49 | measurement_time = parse_ooni_date( 50 | measurement.get("measurement_start_time")).isoformat() 51 | 52 | try: 53 | addresses = measurement["test_keys"]["control"]["dns"]["addrs"] 54 | except KeyError: 55 | raise ValueError("OONI Control Measurement without test_keys.control.dns.addrs: %s" % 56 | pprint.pformat(measurement, compact=True)) 57 | if not addresses: 58 | raise ValueError("OONI Control Measurement with empty test_keys.control.dns.addrs: %s" % 59 | pprint.pformat(measurement, compact=True)) 60 | records: List[dns.ResourceRecord] = [] 61 | last_cname = urlparse(measurement.get("input")).hostname 62 | for address in addresses: 63 | try: 64 | records.append(dns.ResourceRecord( 65 | last_cname, dns.IpAddressData(address))) 66 | except ValueError: 67 | records.append(dns.ResourceRecord( 68 | last_cname, dns.CnameData(address))) 69 | last_cname = address 70 | 71 | measurement_time = parse_ooni_date( 72 | measurement.get("measurement_start_time")).isoformat() 73 | return dns.DnsMeasurement( 74 | measurement_id="%s:control" % measurement_id, 75 | records=records, 76 | time=measurement_time, 77 | provenance="ooni:%s" % measurement_id, 78 | trust_reason="IN_OONI_CONTROL" 79 | ) 80 | 81 | 82 | def get_experiment_dns_measurement(measurement, measurement_id) -> dns.DnsMeasurement: 83 | measurement_time = parse_ooni_date( 84 | measurement.get("measurement_start_time")).isoformat() 85 | try: 86 | ooni_queries = measurement["test_keys"]["queries"] 87 | except KeyError: 88 | raise ValueError("OONI Measurement without test_keys.queries: %s" % 89 | pprint.pformat(measurement, compact=True)) 90 | if not ooni_queries: 91 | raise ValueError("OONI Measurement with empty test_keys.queries: %s" % 92 | pprint.pformat(measurement, compact=True)) 93 | records: List[dns.ResourceRecord] = [] 94 | for ooni_query in ooni_queries: 95 | last_cname = ooni_query.get("hostname") 96 | if not last_cname: 97 | logging.warning("Missing hostname in query %s", ooni_query) 98 | for ooni_answer in ooni_query.get("answers", []): 99 | cname = ooni_answer.get("hostname") 100 | if cname: 101 | if cname == last_cname: 102 | continue 103 | records.append(dns.ResourceRecord( 104 | last_cname, 
dns.CnameData(cname))) 105 | last_cname = cname 106 | else: 107 | ip_str = ooni_answer.get("ipv4") or ooni_answer.get("ipv6") 108 | if ip_str: 109 | try: 110 | records.append(dns.ResourceRecord( 111 | last_cname, dns.IpAddressData(ip_str))) 112 | except ValueError: 113 | logging.warning( 114 | "Measurement %s: invalid IP answer %s", measurement_id, ip_str) 115 | measurement_time = parse_ooni_date( 116 | measurement.get("measurement_start_time")).isoformat() 117 | resolver_ip_str = measurement["test_keys"].get("client_resolver") 118 | resolver_ip = ipaddress.ip_address( 119 | resolver_ip_str) if resolver_ip_str else None 120 | return dns.DnsMeasurement( 121 | measurement_id=measurement_id, 122 | records=records, 123 | time=measurement_time, 124 | resolver_ip=resolver_ip, 125 | client_asn=int(measurement.get("probe_asn")[2:]), 126 | client_country=measurement.get("probe_cc"), 127 | provenance="ooni:%s" % measurement_id, 128 | ) 129 | 130 | 131 | def read_ooni_dns_measurements(ooni_measurements_dir: str) -> Iterable[dns.DnsMeasurement]: 132 | for domain_country_dir in sorted(glob.iglob(os.path.join(ooni_measurements_dir, "*", "*"))): 133 | for filename in glob.iglob(os.path.join(domain_country_dir, "*")): 134 | with open(filename) as file: 135 | measurement = json.load(file) 136 | measurement_id = os.path.splitext( 137 | os.path.basename(filename))[0] 138 | try: 139 | yield get_control_dns_measurement(measurement, measurement_id) 140 | except ValueError as e: 141 | logging.debug(e) 142 | try: 143 | yield get_experiment_dns_measurement(measurement, measurement_id) 144 | except ValueError as e: 145 | logging.debug(e) 146 | logging.info("Done with %s", domain_country_dir) 147 | 148 | 149 | def main(args): 150 | logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) 151 | 152 | if not args.dns_measurements: 153 | args.dns_measurements = os.path.join( 154 | args.ooni_measurements_dir, "dns_records.json") 155 | 156 | os.makedirs(os.path.dirname(args.dns_measurements), exist_ok=True) 157 | with open(args.dns_measurements, "w") as dns_measurements: 158 | for measurement in read_ooni_dns_measurements(args.ooni_measurements_dir): 159 | dns_measurements.write(json.dumps(ds.to_json(measurement))) 160 | dns_measurements.write("\n") 161 | 162 | 163 | if __name__ == "__main__": 164 | parser = argparse.ArgumentParser( 165 | "Convert OONI measurements to DNS Resolutions") 166 | parser.add_argument("--ooni_measurements_dir", type=str, required=True) 167 | parser.add_argument("--dns_measurements", type=str) 168 | parser.add_argument("--debug", action="store_true") 169 | sys.exit(main(parser.parse_args())) 170 | -------------------------------------------------------------------------------- /netanalysis/third_party: -------------------------------------------------------------------------------- 1 | ../third_party -------------------------------------------------------------------------------- /netanalysis/tls/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/tls/__init__.py -------------------------------------------------------------------------------- /netanalysis/tls/domain_ip_validator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Jigsaw Operations LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in 
compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import asyncio 17 | import certifi 18 | import ipaddress 19 | import logging 20 | import pprint 21 | import ssl 22 | import sys 23 | 24 | _SSL_CONTEXT = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=certifi.where()) 25 | _SSL_CONTEXT.check_hostname = False 26 | 27 | 28 | class DomainIpValidator: 29 | async def get_cert(self, domain: str, ip: str, timeout=2.0): 30 | ip = str(ip) 31 | transport, _proto = await asyncio.wait_for(asyncio.get_event_loop().create_connection( 32 | asyncio.Protocol, 33 | host=ip, 34 | port=443, 35 | ssl=_SSL_CONTEXT, 36 | server_hostname=domain), timeout) 37 | transport.close() 38 | return transport.get_extra_info("peercert") 39 | 40 | async def validate_ip(self, domain: str, ip: str, timeout=2.0): 41 | """ 42 | Returns successfully if the IP is valid for the domain. 43 | Raises exception if the validation fails. 44 | """ 45 | cert = await self.get_cert(domain, ip, timeout) 46 | if logging.getLogger().isEnabledFor(logging.DEBUG): 47 | logging.debug("Certificate:\n{}".format(pprint.pformat(cert))) 48 | ssl.match_hostname(cert, domain) 49 | 50 | 51 | def main(args): 52 | logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) 53 | 54 | validator = DomainIpValidator() 55 | all_good = True 56 | for ip_address in args.ip_address: 57 | try: 58 | asyncio.get_event_loop().run_until_complete( 59 | validator.validate_ip(args.domain, str(ip_address), timeout=args.timeout)) 60 | result_str = "VALID" 61 | except (ssl.CertificateError, ConnectionRefusedError, OSError, asyncio.TimeoutError) as e: 62 | all_good = False 63 | result_str = "UNKNOWN (%s)" % repr(e) 64 | print("IP {} is {}".format(ip_address, result_str)) 65 | return 0 if all_good else 1 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser( 70 | "Checks if the given IP addresses are valid for the domain") 71 | parser.add_argument("domain", type=str, 72 | help="The domain to validate the IPs for") 73 | parser.add_argument("ip_address", type=ipaddress.ip_address, 74 | nargs="+", help="The IP address to query") 75 | parser.add_argument("--debug", action="store_true") 76 | parser.add_argument("--timeout", type=float, default=2.0, 77 | help="Timeout in seconds for getting the certificate") 78 | sys.exit(main(parser.parse_args())) 79 | -------------------------------------------------------------------------------- /netanalysis/traffic/README.md: -------------------------------------------------------------------------------- 1 | # Google Traffic Data 2 | 3 | This directory contains libraries and tools to fetch the [Google Transparency Report traffic data](https://transparencyreport.google.com/traffic/overview). 4 | Note that each time-series is normalized, with added noise. They are not the actual traffic numbers. 5 | 6 | ## API Usage Notice 7 | 8 | Before using this code, you must agree to [Google APIs Terms of Service](https://developers.google.com/terms/). 
9 | 10 | This code fetches data from an unsupported and undocumented API used by Google's Transparency Report and may break at any time without notice. 11 | 12 | We expect this repository to be a centralized location where the community can update the code if the API changes. 13 | 14 | 15 | ## Fetch the traffic data 16 | 17 | To fetch the data into a `traffic_data/` directory: 18 | 19 | python -m netanalysis.traffic.data.fetch_google_traffic --output_dir=traffic_data/ 20 | 21 | If you call it a second time, it will skip data already downloaded. Delete the output directory if you want the data to be fetched again. 22 | 23 | Use `--products` to restrict the fetch to specific products. For example: 24 | 25 | python -m netanalysis.traffic.data.fetch_google_traffic --output_dir=traffic_data/ --products=BLOGGER,GROUPS,SITES,TRANSLATE,WEB_SEARCH,YOUTUBE 26 | 27 | You can find the list of products at [model.py](data/model.py). 28 | 29 | 30 | ## Find anomalous traffic 31 | 32 | Run the `find_anomalies` tool: 33 | ``` 34 | python -m netanalysis.traffic.analysis.find_anomalies --traffic_data=traffic_data/ --products=BLOGGER,GROUPS,SITES,TRANSLATE,WEB_SEARCH,YOUTUBE > anomalies.txt 35 | ``` 36 | 37 | This will output a sorted list of detected anomalies with the latest one first. The file is formatted in blocks of regional disruptions, with the individual product disruptions in the region indented within the block. 38 | 39 | ## Analyze anomalies 40 | 41 | You can use the [Traffic Correlations](./analysis/TrafficCorrelations.ipynb) IPython Notebook to better analyze the anomalies. 42 | 43 | Start the Jupyter notebook backend: 44 | 45 | ``` 46 | .venv/bin/jupyter notebook 47 | ``` 48 | 49 | And then open `netanalysis/traffic/analysis/TrafficCorrelations.ipynb`. 50 | -------------------------------------------------------------------------------- /netanalysis/traffic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/traffic/__init__.py -------------------------------------------------------------------------------- /netanalysis/traffic/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/traffic/analysis/__init__.py -------------------------------------------------------------------------------- /netanalysis/traffic/analysis/find_anomalies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2019 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
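# Overview of the pipeline implemented below: find_anomalies() flags days whose
# traffic falls below the lower bound of a seasonal-decomposition expectation;
# group_as_product_disruptions() merges anomalies that are close in time;
# remove_minor_disruptions() drops low-impact groups; and
# group_as_regional_disruptions() merges overlapping product disruptions per region.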
16 | 17 | import argparse 18 | import datetime 19 | import logging 20 | import sys 21 | import time 22 | from typing import List, Iterable 23 | import urllib.parse 24 | 25 | import iso3166 26 | import pandas as pd 27 | import statsmodels.api as sm 28 | 29 | from netanalysis.traffic.analysis import model 30 | import netanalysis.traffic.data.model as traffic 31 | from netanalysis.traffic.data.file_repository import FileTrafficRepository 32 | 33 | logging.getLogger().setLevel(logging.INFO) 34 | 35 | 36 | def get_expectations_1(time_series: pd.Series) -> pd.DataFrame: 37 | # Sets period to 8 weeks. 38 | components = sm.tsa.seasonal_decompose( 39 | time_series, period=7 * 4 * 2, model="additive", two_sided=False) 40 | expected = components.trend + components.seasonal 41 | max_delta = 3 * components.resid.std() 42 | lower_bound = expected - max_delta 43 | upper_bound = expected + max_delta 44 | return pd.DataFrame(index=time_series.index, 45 | data={"expected": expected, 46 | "lower_bound": lower_bound, 47 | "upper_bound": upper_bound}) 48 | 49 | 50 | # def get_expectations_2(time_series): 51 | # # Sets period to 8 weeks. 52 | # components = sm.tsa.seasonal_decompose( 53 | # time_series, period=7 * 4 * 2, model="additive", two_sided=False) 54 | # expected = components.trend + components.seasonal 55 | # window_days = 365 56 | # resid_median = components.resid.rolling( 57 | # center=False, window=window_days).median() 58 | # delta = 4 * 1.4826 * components.resid.rolling(window_days).apply( 59 | # lambda x: np.median(np.fabs(x - np.median(x)))) 60 | # lower_bound = resid_median - delta 61 | # upper_bound = resid_median + delta 62 | # return pd.DataFrame({"expected": expected, "lower_bound": lower_bound, "upper_bound": upper_bound}) 63 | 64 | 65 | def find_anomalies(time_series: pd.Series) -> List[model.AnomalyPoint]: 66 | anomalies = [] # type: List[model.AnomalyPoint] 67 | expectations = get_expectations_1(time_series) 68 | anomalous_dates = (time_series < 69 | expectations.lower_bound).loc[lambda e: e].index # type: List[pd.Timestamp] 70 | mean_traffic = time_series.mean() 71 | for timestamp in anomalous_dates: 72 | relative_impact = ( 73 | expectations.expected[timestamp] - time_series[timestamp]) / mean_traffic 74 | anomalies.append(model.AnomalyPoint( 75 | timestamp.to_pydatetime(), time_series[timestamp], expectations.expected[timestamp], relative_impact)) 76 | return anomalies 77 | 78 | 79 | def group_as_product_disruptions(product_id: traffic.ProductId, 80 | anomalies: Iterable[model.AnomalyPoint], 81 | max_time_delta: datetime.timedelta) -> List[model.ProductDisruption]: 82 | """Groups anomalies that are within the given max_time_delta""" 83 | disruptions = [] # type: List[model.ProductDisruption] 84 | current_disruption = None # type: model.ProductDisruption 85 | disruption_end = datetime.datetime.min 86 | for anomaly in anomalies: 87 | if anomaly.timestamp > disruption_end + max_time_delta: 88 | current_disruption = model.ProductDisruption(product_id) 89 | disruptions.append(current_disruption) 90 | current_disruption.add_anomaly(anomaly) 91 | disruption_end = current_disruption.end 92 | return disruptions 93 | 94 | 95 | def remove_minor_disruptions(product_disruptions: List[model.ProductDisruption]) -> List[model.ProductDisruption]: 96 | return [p for p in product_disruptions if p.relative_impact >= 1.0 and max(a.relative_impact for a in p.anomalies) >= 0.5] 97 | 98 | 99 | def group_as_regional_disruptions( 100 | region_code: str, 101 | product_disruptions: 
List[model.ProductDisruption]) -> List[model.RegionDisruption]: 102 | region_disruptions = [] # type: List[model.RegionDisruption] 103 | current_region_disruption = None # type: model.RegionDisruption 104 | disruption_end = datetime.datetime.min 105 | for product_disruption in sorted(product_disruptions, key=lambda d: d.start): 106 | if product_disruption.start > disruption_end: 107 | current_region_disruption = model.RegionDisruption(region_code) 108 | region_disruptions.append(current_region_disruption) 109 | current_region_disruption.add_product_disruption(product_disruption) 110 | disruption_end = current_region_disruption.end 111 | return region_disruptions 112 | 113 | 114 | def _to_google_timestamp(timestamp: datetime.datetime): 115 | """Converts a datetime.datetime to the timestamp format used by the Transparency Report""" 116 | return int(time.mktime(timestamp.timetuple()) * 1000) 117 | 118 | 119 | def _make_report_url(start_date: datetime.datetime, end_date: datetime.datetime, region_code: str, product_id: traffic.ProductId): 120 | """Creates a Transparency Report url""" 121 | # Align with the end of the day 122 | end_date = end_date + datetime.timedelta(days=1) 123 | chart_padding = (end_date - start_date) * 2 124 | chart_start_date = start_date - chart_padding 125 | chart_end_date = min(end_date + chart_padding, datetime.datetime.now()) 126 | return ("https://transparencyreport.google.com/traffic/overview?%s" % 127 | urllib.parse.urlencode({ 128 | "lu": "fraction_traffic", 129 | "fraction_traffic": "product:%s;start:%s;end:%s;region:%s" % ( 130 | product_id.value, _to_google_timestamp(chart_start_date), 131 | _to_google_timestamp(chart_end_date), region_code 132 | ) 133 | }) 134 | ) 135 | 136 | 137 | def _make_tor_users_url(start_date: datetime.datetime, end_date: datetime.datetime, region_code: str): 138 | end_date = end_date + datetime.timedelta(days=1) 139 | chart_padding = max(datetime.timedelta(days=7), (end_date - start_date) * 2) 140 | chart_start_date = start_date - chart_padding 141 | chart_end_date = min(end_date + chart_padding, datetime.datetime.now()) 142 | return ("https://metrics.torproject.org/userstats-relay-country.html?%s" % 143 | urllib.parse.urlencode({ 144 | "events": "on", 145 | "start": chart_start_date.date().isoformat(), 146 | "end": chart_end_date.date().isoformat(), 147 | "country": region_code.lower() 148 | }) 149 | ) 150 | 151 | 152 | def _make_context_web_search_url(start_date: datetime.datetime, end_date: datetime.datetime, region_code: str): 153 | return ("https://www.google.com/search?%s" % 154 | urllib.parse.urlencode({ 155 | "q": "internet %s" % iso3166.countries.get(region_code).name, 156 | "tbs": "cdr:1,cd_min:%s,cd_max:%s" % ( 157 | start_date.date().strftime("%m/%d/%Y"), 158 | end_date.date().strftime("%m/%d/%Y") 159 | ) 160 | }) 161 | ) 162 | 163 | 164 | def _make_context_twitter_url(start_date: datetime.datetime, end_date: datetime.datetime, region_code: str): 165 | return ("https://twitter.com/search?%s" % 166 | urllib.parse.urlencode({ 167 | "q": "internet %s since:%s until:%s" % ( 168 | iso3166.countries.get(region_code).name, 169 | start_date.date().isoformat(), 170 | end_date.date().isoformat() 171 | ) 172 | }) 173 | ) 174 | 175 | 176 | def print_disruption_csv(disruption: model.RegionDisruption) -> None: 177 | country_name = iso3166.countries.get(disruption.region_code).name 178 | search_url = _make_context_web_search_url(disruption.start, 179 | disruption.start + datetime.timedelta(days=7), 180 | disruption.region_code) 181 | 
twitter_url = _make_context_twitter_url(disruption.start, 182 | disruption.start + datetime.timedelta(days=7), 183 | disruption.region_code) 184 | tor_url = _make_tor_users_url(disruption.start, disruption.end, disruption.region_code) 185 | print("%s (%s) %s %s Context: %s %s %s" % ( 186 | country_name, disruption.region_code, disruption.start.date().isoformat(), 187 | disruption.end.date().isoformat(), 188 | search_url, twitter_url, tor_url 189 | )) 190 | for product_disruption in disruption.product_disruptions: 191 | report_url = _make_report_url( 192 | product_disruption.start, product_disruption.end, disruption.region_code, product_disruption.product_id) 193 | print(" %s, %s, %s, %f, %f, %s" % ( 194 | product_disruption.product_id.name, 195 | product_disruption.start.date(), 196 | product_disruption.end.date(), 197 | product_disruption.relative_impact, 198 | product_disruption.absolute_impact, 199 | report_url, 200 | )) 201 | # return 202 | # report_url = _make_report_url( 203 | # disruption.start, disruption.end, disruption.region_code, product_id) 204 | # print("%s,%s,%s,%s,%s,%f,%f,%s" % ( 205 | # disruption.start.date().isoformat(), disruption.end.date().isoformat(), 206 | # disruption.region_code, disruption.product_id.value, 207 | # disruption.product_id.name, disruption.relative_impact, 208 | # disruption.absolute_impact, report_url)) 209 | 210 | 211 | def find_all_disruptions(repo: traffic.TrafficRepository, 212 | regions: Iterable[str], products: Iterable[traffic.ProductId]) -> List[model.RegionDisruption]: 213 | """Returns a list of all region disruptions for the given regions and analyzing the given products only.""" 214 | # TODO: Investigate why YouTube is not output for these outages: 215 | # TG 2017-09-20 2017-09-21 216 | # BLOGGER, 2017-09-20, 2017-09-21, 2.085934, 0.115080, https://transparencyreport.google.com/traffic/overview?lu=fraction_traffic&fraction_traffic=product:2;start:1505534400000;end:1506398400000;region:TG 217 | # WEB_SEARCH, 2017-09-20, 2017-09-21, 1.388299, 0.223981, https://transparencyreport.google.com/traffic/overview?lu=fraction_traffic&fraction_traffic=product:19;start:1505534400000;end:1506398400000;region:TG 218 | # ET 2017-05-31 2017-06-07 219 | # TRANSLATE, 2017-05-31, 2017-06-02, 2.786339, 0.203082, https://transparencyreport.google.com/traffic/overview?lu=fraction_traffic&fraction_traffic=product:16;start:1495684800000;end:1496980800000;region:ET 220 | # WEB_SEARCH, 2017-05-31, 2017-06-07, 5.233837, 1.615268, https://transparencyreport.google.com/traffic/overview?lu=fraction_traffic&fraction_traffic=product:19;start:1494820800000;end:1498276800000;region:ET 221 | 222 | all_disruptions = [] # type: List[model.RegionDisruption] 223 | for region_code in regions: 224 | product_disruptions = [] # type: List[model.ProductDisruption] 225 | for product_id in products: 226 | try: 227 | if product_id == traffic.ProductId.UNKNOWN: 228 | continue 229 | logging.info("Processing region %s product %s", 230 | region_code, product_id.name) 231 | 232 | full_time_series = repo.get_traffic(region_code, product_id) 233 | if full_time_series.empty: 234 | logging.info( 235 | "Empty time series for region %s product %s", region_code, product_id.name) 236 | continue 237 | 238 | daily_time_series = full_time_series.resample("D").mean() 239 | anomalies = find_anomalies(daily_time_series) 240 | if not anomalies: 241 | logging.info("Found no anomalies") 242 | continue 243 | grouped_disruptions = group_as_product_disruptions( 244 | product_id, anomalies, 
datetime.timedelta(days=3)) 245 | major_grouped_disruptions = remove_minor_disruptions( 246 | grouped_disruptions) 247 | logging.info("Found %d major product disruptions from %d disruptions and %d anomalies", 248 | len(major_grouped_disruptions), len(grouped_disruptions), len(anomalies)) 249 | product_disruptions.extend(major_grouped_disruptions) 250 | except Exception as error: 251 | logging.info("Error processing region %s, product %s: %s", region_code, product_id.name, str(error)) 252 | region_disruptions = group_as_regional_disruptions( 253 | region_code, product_disruptions) 254 | logging.info("Found %d region disruptions from %d product disruptions for %s", len( 255 | region_disruptions), len(product_disruptions), region_code) 256 | all_disruptions.extend(region_disruptions) 257 | return all_disruptions 258 | 259 | 260 | def main(args): 261 | repo = FileTrafficRepository(args.traffic_data) 262 | if args.products: 263 | product_id_list = [traffic.ProductId[ps.strip().upper()] for ps in args.products.split(",")] 264 | else: 265 | product_id_list = [p for p in traffic.ProductId if p.value != traffic.ProductId.UNKNOWN] 266 | 267 | all_disruptions: List[model.RegionDisruption] = []  # Initialized here so a Ctrl-C before completion doesn't leave it unbound. 268 | try: 269 | all_disruptions = find_all_disruptions(repo, repo.list_regions(), product_id_list) 270 | except KeyboardInterrupt: 271 | pass 272 | 273 | logging.info("Found %d total region disruptions", len(all_disruptions)) 274 | all_disruptions.sort(reverse=True, key=lambda d: (d.start, d.end)) 275 | for region_disruption in all_disruptions: 276 | print_disruption_csv(region_disruption) 277 | 278 | 279 | if __name__ == "__main__": 280 | parser = argparse.ArgumentParser( 281 | description="Finds anomalies in traffic data") 282 | parser.add_argument("--traffic_data", type=str, required=True, help="The base directory of the traffic data") 283 | parser.add_argument("--products", type=str, 284 | help="Comma-separated list of the products to analyze") 285 | sys.exit(main(parser.parse_args())) 286 | -------------------------------------------------------------------------------- /netanalysis/traffic/analysis/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2019 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import datetime 18 | from typing import List 19 | 20 | from netanalysis.traffic.data import model as traffic 21 | 22 | 23 | class AnomalyPoint(object): 24 | """A single timeline point outside the expected range. 
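For example (made-up numbers): traffic 0.2 against an expected 0.5, with a mean
traffic of 0.4, gives absolute_impact 0.3 and relative_impact 0.75.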
25 | 26 | Attributes: 27 | timestamp: The time of the anomaly 28 | traffic: The observed traffic number 29 | expected: What traffic number was expected 30 | absolute_impact: expected - traffic 31 | relative_impact: absolute_impact / mean traffic 32 | """ 33 | 34 | def __init__(self, timestamp: datetime.datetime, traffic: float, 35 | expected: float, relative_impact: float) -> None: 36 | self.timestamp = timestamp 37 | self.traffic = traffic 38 | self.expected = expected 39 | self.absolute_impact = self.expected - self.traffic 40 | self.relative_impact = relative_impact 41 | 42 | def __repr__(self) -> str: 43 | return "AnomalyPoint(%s)" % repr(self.__dict__) 44 | 45 | 46 | class ProductDisruption(object): 47 | """ A disruption to a product represented by a sequence of anomalous points. 48 | 49 | This refers to a single region, which is implicit. 50 | 51 | Attributes: 52 | product_id: The ProductId of the product this disruption is about 53 | start: Time of the first anomaly point 54 | end: Time of the last anomaly point 55 | anomalies: List of all observed anomalies 56 | absolute_impact: Sum of the absolute impact of all anomalies 57 | relative_impact: Sum of the relative impact of all anomalies 58 | """ 59 | 60 | def __init__(self, product_id: traffic.ProductId) -> None: 61 | self.product_id = product_id 62 | self.start = datetime.datetime.max 63 | self.end = datetime.datetime.min 64 | self.anomalies: List[AnomalyPoint] = [] 65 | self.absolute_impact = 0.0 66 | self.relative_impact = 0.0 67 | 68 | def add_anomaly(self, anomaly: AnomalyPoint) -> None: 69 | self.anomalies.append(anomaly) 70 | self.start = min(self.start, anomaly.timestamp) 71 | self.end = max(self.end, anomaly.timestamp) 72 | self.relative_impact += anomaly.relative_impact 73 | self.absolute_impact += anomaly.absolute_impact 74 | 75 | def __repr__(self) -> str: 76 | return "ProductDisruption(%s)" % repr(self.__dict__) 77 | 78 | 79 | class RegionDisruption(object): 80 | """A disruption to traffic in a region. 81 | 82 | The region disruption is represented by overlapping disruptions of 83 | multiple products in that region. 84 | 85 | Attributes: 86 | region_code: The country code of the region this disruption is about. 
87 | start: Time of the first anomaly point 88 | end: Time of the last anomaly point 89 | product_disruptions: The list of all observed ProductDisruptions 90 | """ 91 | 92 | def __init__(self, region_code: str) -> None: 93 | self.region_code = region_code 94 | self.start = datetime.datetime.max 95 | self.end = datetime.datetime.min 96 | self.product_disruptions: List[ProductDisruption] = [] 97 | 98 | def add_product_disruption(self, product_disruption: ProductDisruption) -> None: 99 | self.product_disruptions.append(product_disruption) 100 | self.start = min(self.start, product_disruption.start) 101 | self.end = max(self.end, product_disruption.end) 102 | 103 | def __repr__(self) -> str: 104 | return "RegionDisruption(%s)" % repr(self.__dict__) 105 | -------------------------------------------------------------------------------- /netanalysis/traffic/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/netanalysis/traffic/data/__init__.py -------------------------------------------------------------------------------- /netanalysis/traffic/data/api_repository.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2019 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Library to access Google's traffic data from its Transparency Report 19 | """ 20 | import datetime 21 | import json 22 | import ssl 23 | import time 24 | from urllib.request import urlopen, Request 25 | from urllib.parse import urlencode, quote 26 | 27 | import certifi 28 | import pandas as pd 29 | 30 | from netanalysis.traffic.data import model 31 | 32 | 33 | def _to_timestamp(time_point: datetime.datetime): 34 | return time.mktime(time_point.timetuple()) 35 | 36 | 37 | _SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where()) 38 | 39 | 40 | class ApiTrafficRepository(model.TrafficRepository): 41 | """TrafficRepository that reads the traffic data from Google's Transparency Report.""" 42 | 43 | def _query_api(self, endpoint, params=None): 44 | query_url = "https://transparencyreport.google.com/transparencyreport/api/v3/traffic/" + \ 45 | quote(endpoint) 46 | if params: 47 | query_url = query_url + "?" 
+ urlencode(params) 48 | try: 49 | request = Request(query_url) 50 | request.add_header("User-Agent", "Jigsaw-Code/netanalysis") 51 | with urlopen(request, context=_SSL_CONTEXT) as response: 52 | return json.loads(response.read()[6:].decode("utf8"))  # Skip the 6-byte non-JSON prefix (likely an anti-XSSI guard) before parsing. 53 | except Exception as error: 54 | raise Exception("Failed to query url %s" % query_url) from error 55 | 56 | def list_regions(self): 57 | response_proto = self._query_api("regionlist") 58 | return sorted([e[0] for e in response_proto[0][1]]) 59 | 60 | def get_traffic(self, region_code: str, product_id: model.ProductId, 61 | start: datetime.datetime = None, end: datetime.datetime = None): 62 | DEFAULT_INTERVAL_DAYS = 2 * 365 63 | POINTS_PER_DAY = 48 64 | if not end: 65 | end = datetime.datetime.now() 66 | if not start: 67 | start = end - datetime.timedelta(days=DEFAULT_INTERVAL_DAYS) 68 | number_of_days = (end - start).days 69 | total_points = int(number_of_days * POINTS_PER_DAY) 70 | entries = [] 71 | params = [ 72 | ("start", int(_to_timestamp(start) * 1000)), 73 | ("end", int(_to_timestamp(end) * 1000)), 74 | ("width", total_points), 75 | ("product", product_id.value), 76 | ("region", region_code)] 77 | response_proto = self._query_api("fraction", params) 78 | entry_list_proto = response_proto[0][1] 79 | for entry_proto in entry_list_proto: 80 | timestamp = datetime.datetime.utcfromtimestamp( 81 | entry_proto[0] / 1000) 82 | value = entry_proto[1][0][1] 83 | entries.append((timestamp, value / POINTS_PER_DAY / 2)) 84 | dates, traffic = zip(*entries) 85 | return pd.Series(traffic, index=dates) 86 | -------------------------------------------------------------------------------- /netanalysis/traffic/data/fetch_google_traffic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2019 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Get the traffic data from the Google Transparency Report and save as CSV. 
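Each saved CSV row is `timestamp,traffic` with no header row; an illustrative row:
2019-01-01T00:00:00,0.021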
19 | 20 | It will save a file for each region as ${OUTPUT_DIR}/[REGION_CODE]/[PRODUCT_NAME].csv 21 | """ 22 | import argparse 23 | import csv 24 | import datetime 25 | import logging 26 | import os 27 | import sys 28 | 29 | from netanalysis.traffic.data import model 30 | import netanalysis.traffic.data.api_repository as api 31 | 32 | logging.getLogger().setLevel(logging.INFO) 33 | 34 | 35 | def main(args): 36 | if not args.output_dir: 37 | logging.error("Need to specify output directory") 38 | return 1 39 | if not os.path.exists(args.output_dir): 40 | os.makedirs(args.output_dir) 41 | 42 | report: model.TrafficRepository = api.ApiTrafficRepository() 43 | if args.products: 44 | product_id_list = [model.ProductId[ps.strip().upper()] for ps in args.products.split(",")] 45 | else: 46 | product_id_list = [p for p in model.ProductId if p.value != model.ProductId.UNKNOWN] 47 | region_code_list = report.list_regions() 48 | end_time = datetime.datetime.now() 49 | start_time = end_time - datetime.timedelta(days=5*365) 50 | for region_code in region_code_list: 51 | logging.info("Processing region %s", region_code) 52 | output_region_directory = os.path.join(args.output_dir, region_code) 53 | if not os.path.exists(output_region_directory): 54 | os.makedirs(output_region_directory) 55 | 56 | for product_id in product_id_list: 57 | logging.info("Fetching traffic data for region %s product %s", region_code, product_id.name) 58 | csv_filename = os.path.join(output_region_directory, "%s.csv" % product_id.name) 59 | if os.path.exists(csv_filename): 60 | logging.info("Traffic data already available for %s in %s. Skipping...", 61 | product_id.name, region_code) 62 | continue 63 | try: 64 | traffic_series = report.get_traffic(region_code, product_id, start_time, end_time) 65 | if traffic_series.empty: 66 | logging.info("No traffic for product %s in region %s", product_id.name, region_code) 67 | continue 68 | with open(csv_filename, "w") as csv_file: 69 | writer = csv.writer(csv_file) 70 | for entry in traffic_series.iteritems(): 71 | writer.writerow((entry[0].isoformat(), entry[1])) 72 | except Exception as error: 73 | logging.warning("Failed to get traffic for %s %s: %s", 74 | region_code, product_id.name, str(error)) 75 | return 0 76 | 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser( 80 | description='Fetches traffic data from the Google Transparency Report as CSV') 81 | parser.add_argument("--output_dir", type=str, required=True, help='The base directory for the output') 82 | parser.add_argument("--products", type=str, 83 | help="Comma-separated list of the products to get traffic for") 84 | sys.exit(main(parser.parse_args())) 85 | -------------------------------------------------------------------------------- /netanalysis/traffic/data/file_repository.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2019 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import os 18 | from typing import Iterable 19 | 20 | import pandas as pd 21 | 22 | from netanalysis.traffic.data import model 23 | 24 | 25 | class FileTrafficRepository(model.TrafficRepository): 26 | """TrafficRepository that reads the traffic data from previously downloaded files""" 27 | 28 | def __init__(self, base_directory: str) -> None: 29 | self.base_directory = base_directory 30 | 31 | def list_regions(self) -> Iterable[str]: 32 | return sorted(os.listdir(self.base_directory)) 33 | 34 | def get_traffic(self, region_code: str, product_id: model.ProductId) -> pd.Series: 35 | filename = os.path.join(self.base_directory, region_code, "%s.csv" % product_id.name) 36 | try: 37 | return pd.read_csv(filename, parse_dates=True, squeeze=True, 38 | index_col="timestamp", names=["timestamp", "traffic"]) 39 | except FileNotFoundError: 40 | return pd.Series(dtype=float)  # Empty result; a Series matches the declared return type. 41 | -------------------------------------------------------------------------------- /netanalysis/traffic/data/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2019 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | Model for traffic data repositories. 19 | """ 20 | import abc 21 | import datetime 22 | from enum import IntEnum 23 | from typing import Iterable 24 | 25 | import pandas as pd 26 | 27 | 28 | class ProductId(IntEnum): 29 | UNKNOWN = 0 30 | ALL = 1 31 | BLOGGER = 2 32 | BOOKS = 3 33 | DOCS = 4 34 | EARTH = 5 35 | GMAIL = 6 36 | GROUPS = 7 37 | IMAGES = 8 38 | MAPS = 9 39 | ORKUT = 11 40 | PICASA_WEB_ALBUMS = 12 41 | SITES = 14 42 | SPREADSHEETS = 15 43 | TRANSLATE = 16 44 | VIDEOS = 18 45 | WEB_SEARCH = 19 46 | YOUTUBE = 21 47 | 48 | 49 | class TrafficRepository(abc.ABC): 50 | @abc.abstractmethod 51 | def list_regions(self) -> Iterable[str]: 52 | pass 53 | 54 | @abc.abstractmethod 55 | def get_traffic(self, region_code: str, product_id: ProductId, 56 | start: datetime.datetime = None, end: datetime.datetime = None 57 | ) -> pd.Series: 58 | pass 59 | -------------------------------------------------------------------------------- /pipenv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | # 3 | # Copyright 2017 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | PIPENV_VENV_IN_PROJECT=1 pipenv "$@" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Jigsaw Operations LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import setuptools 18 | 19 | with open("README.md", "r") as readme: 20 | long_description = readme.read() 21 | 22 | setuptools.setup( 23 | name="jigsaw-net-analysis", 24 | version="0.1.0", 25 | author="Jigsaw Operations, LLC", 26 | description="Network analysis tools", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | url="https://github.com/Jigsaw-Code/net-analysis", 30 | packages=setuptools.find_packages(), 31 | classifiers=[ 32 | "Programming Language :: Python :: 3", 33 | "License :: OSI Approved :: Apache Software License", 34 | "Operating System :: OS Independent", 35 | "Topic :: Internet", 36 | "Topic :: Software Development :: Libraries", 37 | "Topic :: System :: Networking :: Monitoring" 38 | ], 39 | python_requires='>=3.6.9', 40 | install_requires=[ 41 | "boto3", 42 | "cchardet", 43 | "certifi", 44 | "iso3166", 45 | "jupyter", 46 | "lz4", 47 | "networkx", 48 | "geoip2", 49 | "google-cloud-bigquery", 50 | "matplotlib", 51 | "pandas", 52 | "pydot", 53 | "scipy", 54 | "statsmodels", 55 | "ujson" 56 | ], 57 | include_package_data=True, 58 | ) 59 | -------------------------------------------------------------------------------- /third_party/caida.org/README.md: -------------------------------------------------------------------------------- 1 | Data from 2 | * http://www.caida.org/data/as-classification/ 3 | * http://www.caida.org/data/as-organizations/ 4 | -------------------------------------------------------------------------------- /third_party/caida.org/as-classification/METADATA: -------------------------------------------------------------------------------- 1 | name: "as-classification" 2 | third_party { 3 | url { 4 | type: HOMEPAGE 5 | value: "http://www.caida.org/data/as-classification/" 6 | } 7 | url { 8 | type: ARCHIVE 9 | value: "http://data.caida.org/datasets/as-classification/20200801.as2types.txt.gz" 10 | } 11 | version: "20200801" 12 | last_upgrade_date { year: 2020 month: 8 day: 18 } 13 | } 14 | -------------------------------------------------------------------------------- /third_party/caida.org/as-classification/as2types.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/third_party/caida.org/as-classification/as2types.txt.gz -------------------------------------------------------------------------------- /third_party/caida.org/as-organizations/METADATA: 
-------------------------------------------------------------------------------- 1 | name: "as-organizations" 2 | third_party { 3 | url { 4 | type: HOMEPAGE 5 | value: "http://www.caida.org/data/as-organizations/" 6 | } 7 | url { 8 | type: ARCHIVE 9 | value: "http://data.caida.org/datasets/as-organizations/20200701.as-org2info.txt.gz" 10 | } 11 | version: "20200701" 12 | last_upgrade_date { year: 2020 month: 8 day: 18 } 13 | } 14 | -------------------------------------------------------------------------------- /third_party/caida.org/as-organizations/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains files mapping Autonomous Systems (AS) to 2 | their Organizations (Org). 3 | 4 | http://www.caida.org/research/topology/as2org/ 5 | http://www.caida.org/data/as-organizations/ 6 | 7 | 8 | The as2org files contain two different types of entries: AS numbers and 9 | organizations. The two data types are divided by lines that start with 10 | '# format....'. An example can be found below. 11 | 12 | # format: aut|changed|name|org_id|source 13 | 1|20120224|LVLT-1|LVLT-ARIN|ARIN 14 | # format: org_id|changed|name|country|source 15 | LVLT-ARIN|20120130|Level 3 Communications, Inc.|US|ARIN 16 | 17 | ---------- 18 | AS fields 19 | ---------- 20 | aut : the AS number 21 | changed : the changed date provided by its WHOIS entry 22 | name : the name provided for the individual AS number 23 | org_id : maps to an organization entry 24 | source : the RIR or NIR database which contained this entry 25 | 26 | -------------------- 27 | Organization fields 28 | -------------------- 29 | org_id : unique ID for the given organization 30 | some will be created by the WHOIS entry and others will be 31 | created by our scripts 32 | changed : the changed date provided by its WHOIS entry 33 | name : name could be selected from the AUT entry tied to the 34 | organization, the AUT entry with the largest customer cone, 35 | listed for the organization (if there existed a standalone 36 | organization), or a human maintained file. 37 | country : some WHOIS provide it as an individual field. In other cases 38 | we inferred it from the addresses 39 | source : the RIR or NIR database which contained this entry 40 | 41 | ------------------------ 42 | Acceptable Use Agreement 43 | ------------------------ 44 | 45 | The AUA that you accepted when you were given access to this data is 46 | included in PDF format as a separate file in the same directory as this 47 | README file. When referencing this data (as required by the AUA), 48 | please use: 49 | 50 | The CAIDA AS Organizations Dataset, 51 | http://www.caida.org/data/as-organizations 52 | 53 | Also, please, report your publication to CAIDA 54 | (http://www.caida.org/data/publications/report-publication.xml). 55 | 56 | ------------------- 57 | Updating CAIDA Data 58 | ------------------- 59 | See https://github.com/Jigsaw-Code/net-analysis/pull/11 as an example for how to pull fresh data.
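The pipe-delimited format above maps naturally onto dictionaries. Below is a minimal parsing sketch (not part of this repository; the `parse_as2org` helper and its field handling are assumptions based only on the format description above):

```python
import gzip

def parse_as2org(path: str):
    """Yields one dict per AS or organization entry, keyed by the fields
    declared in the most recent '# format:' header line."""
    field_names = []
    with gzip.open(path, mode="rt", encoding="utf-8", errors="replace") as lines:
        for raw_line in lines:
            line = raw_line.rstrip("\n")
            if line.startswith("# format:"):
                # e.g. '# format: aut|changed|name|org_id|source'
                field_names = line.split(":", 1)[1].strip().split("|")
            elif line and not line.startswith("#"):
                yield dict(zip(field_names, line.split("|")))
```

For example, `{entry['org_id'] for entry in parse_as2org('as-org2info.txt.gz') if 'aut' in entry}` would collect the organization IDs referenced by AS entries.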
-------------------------------------------------------------------------------- /third_party/caida.org/as-organizations/as-org2info.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/third_party/caida.org/as-organizations/as-org2info.txt.gz -------------------------------------------------------------------------------- /third_party/caida.org/caida_pub_aua.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/third_party/caida.org/caida_pub_aua.pdf -------------------------------------------------------------------------------- /third_party/db-ip/dbip-asn-lite/COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | IP Geolocation by DB-IP -------------------------------------------------------------------------------- /third_party/db-ip/dbip-asn-lite/LICENSE.txt: -------------------------------------------------------------------------------- 1 | This database is licensed under a Creative Commons Attribution 4.0 International License (https://creativecommons.org/licenses/by/4.0/) 2 | -------------------------------------------------------------------------------- /third_party/db-ip/dbip-asn-lite/METADATA: -------------------------------------------------------------------------------- 1 | name: "dbip-asn-lite" 2 | third_party { 3 | url { 4 | type: HOMEPAGE 5 | value: "https://db-ip.com/db/download/ip-to-asn-lite" 6 | } 7 | url { 8 | type: ARCHIVE 9 | value: "https://download.db-ip.com/free/dbip-asn-lite-2022-02.mmdb.gz" 10 | } 11 | version: "2022-02" 12 | last_upgrade_date { year: 2022 month: 2 day: 23 } 13 | } 14 | -------------------------------------------------------------------------------- /third_party/db-ip/dbip-asn-lite/dbip-asn-lite.mmdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/third_party/db-ip/dbip-asn-lite/dbip-asn-lite.mmdb -------------------------------------------------------------------------------- /third_party/db-ip/dbip-country-lite/COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | IP Geolocation by DB-IP -------------------------------------------------------------------------------- /third_party/db-ip/dbip-country-lite/LICENSE.txt: -------------------------------------------------------------------------------- 1 | This database is licensed under a Creative Commons Attribution 4.0 International License (https://creativecommons.org/licenses/by/4.0/) 2 | -------------------------------------------------------------------------------- /third_party/db-ip/dbip-country-lite/METADATA: -------------------------------------------------------------------------------- 1 | name: "dbip-country-lite" 2 | third_party { 3 | url { 4 | type: HOMEPAGE 5 | value: "https://db-ip.com/db/download/ip-to-country-lite" 6 | } 7 | url { 8 | type: ARCHIVE 9 | value: "https://download.db-ip.com/free/dbip-country-lite-2022-02.mmdb.gz" 10 | } 11 | version: "2022-02" 12 | last_upgrade_date { year: 2022 month: 2 day: 23 } 13 | } 14 | -------------------------------------------------------------------------------- /third_party/db-ip/dbip-country-lite/dbip-country-lite.mmdb:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jigsaw-Code/net-analysis/61dca80a2d4600378337e7bac440f05e6d1b6b9e/third_party/db-ip/dbip-country-lite/dbip-country-lite.mmdb -------------------------------------------------------------------------------- /tools/ipython.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Jigsaw Operations LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from IPython import embed 16 | 17 | embed() -------------------------------------------------------------------------------- /tools/jupyter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Jigsaw Operations LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from notebook import notebookapp 16 | 17 | notebookapp.main() 18 | --------------------------------------------------------------------------------
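The DB-IP files checked in under third_party/db-ip/ are standard MMDB databases, so they can be queried with the geoip2 package that setup.py already lists in install_requires. A minimal sketch follows, assuming it is run from the repository root; it is an illustration only, not the repository's own lookup API (which presumably wraps these databases in netanalysis/ip/ip_info.py and may differ).

import geoip2.database

# Paths to the MMDB snapshots bundled with the repository.
ASN_DB = "third_party/db-ip/dbip-asn-lite/dbip-asn-lite.mmdb"
COUNTRY_DB = "third_party/db-ip/dbip-country-lite/dbip-country-lite.mmdb"

with geoip2.database.Reader(ASN_DB) as asn_reader, \
        geoip2.database.Reader(COUNTRY_DB) as country_reader:
    ip = "8.8.8.8"  # any address of interest
    asn = asn_reader.asn(ip)
    country = country_reader.country(ip)
    print(asn.autonomous_system_number,        # e.g. 15169
          asn.autonomous_system_organization,  # e.g. "Google LLC"
          country.country.iso_code)            # e.g. "US"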