├── .gitignore ├── LICENSE.md ├── README.md ├── bin ├── daily.sh ├── daily_ct.sh ├── download.rb ├── download.sh ├── normalize.rb └── normalize.sh ├── conf └── inetdata.json.sample ├── lib ├── inetdata.rb └── inetdata │ ├── config.rb │ ├── logger.rb │ ├── source.rb │ └── source │ ├── arin.rb │ ├── base.rb │ ├── caida_prefix2as.rb │ ├── censys_certs.rb │ ├── censys_ipv4.rb │ ├── ct.rb │ ├── czds.rb │ ├── gov.rb │ ├── govuk.rb │ ├── premiumdrops.rb │ ├── rir.rb │ ├── sonar.rb │ ├── whoisxmlapi.rb │ └── wwwsio.rb └── logs └── .keep /.gitignore: -------------------------------------------------------------------------------- 1 | conf/inetdata.json 2 | data/cache 3 | logs 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Special Circumstances, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Internet Data Download 2 | 3 | Download and normalize internet data from various sources. This package is normally run on a daily basis (after 10:00am CST). 4 | 5 | ## Dependencies 6 | 7 | ### Ubuntu 8 | * sudo apt-get install coreutils build-essential libssl-dev curl gnupg pigz liblz4-tool 9 | 10 | ### Ruby 11 | 12 | #### Ubuntu 16.04 LTS 13 | * sudo apt-get install ruby 14 | 15 | #### Other Distributions 16 | * gpg --keyserver hkp://keys.gnupg.net --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3 17 | * \curl -sSL https://get.rvm.io | bash -s stable --ruby=2.3.3 18 | 19 | ### inetdata-parsers 20 | 21 | The normalization process depends on the tools provided by the inetdata-parsers project. Please see the [README](https://github.com/hdm/inetdata-parsers/) for more information. The inetdata-parsers tools need to be in the system path for the normalization process to complete. 22 | 23 | ### System Limits 24 | 25 | The normalization process requires a large number of open file handles. If the normalizer is run as root, it will attempt to modify rlimit automatically. 
If the normalizer is run as a non-privileged user, the following changes need to be made: 26 | 27 | * Update /etc/security/limits.conf to include: 28 | ``` 29 | * soft nofile 65536 30 | * hard nofile 65536 31 | root soft nofile 65536 32 | root hard nofile 65536 33 | ``` 34 | 35 | * Log out and log back in 36 | * Check the output of ```ulimit -n``` to ensure that the soft and hard limits are updated 37 | 38 | 39 | ## Configuration 40 | 41 | A sample configuration file is provided in ``conf/inetdata.json.sample``. This should be copied to ``conf/inetdata.json`` and updated with your credentials and specific settings for your environment. Credential configuration is described in the *Data Sources* section below. The following system settings are important and should be tuned as needed: 42 | 43 | ```json 44 | "storage": "./data/cache/", 45 | "logs": "./data/logs/", 46 | "log_stderr": true, 47 | "log_debug": false, 48 | "DISABLED_max_ram": "16G", 49 | "DISABLED_max_cores" : "4", 50 | ``` 51 | * *storage*: The storage parameter determines where daily downloads and normalized files are saved. For a typical install, this will consume around 1TB/mo, but may be more or less depending on which sources are enabled. Keep in mind that the normalizer doesn't delete the source data, and search-optimized files (such as MTBLs) can be even larger than the originals. 52 | 53 | * *logs*: The logs parameter determines where output from the download and normalize jobs is saved. This requires a minimal amount of storage (300M/mo). 54 | 55 | * *log_stderr*: The log_stderr parameter controls whether or not the download and normalize jobs print to stderr as well as the log. This is useful to enable when running the download or normalize scripts on the command line. 56 | 57 | * *log_debug*: The log_debug parameter controls whether or not the download and normalize jobs log additional output that is helpful for diagnostics. 58 | 59 | * *max_ram*: The max_ram parameter determines how much memory is used for the normalize jobs. The "DISABLED_" prefix should be removed and the value set to approximately half of system memory. The normalizer will not work well on systems with less than 16GB of memory. 60 | 61 | * *max_cores*: The max_cores parameter determines how many threads to use for the normalize jobs. The "DISABLED_" prefix should be removed and the value set to the number of cores that can be dedicated to this job. The default is to use all available cores when possible. The "nice" command is used to lower the priority of the normalize job, making this relatively safe to use on shared systems. 62 | 63 | 64 | ## Usage 65 | 66 | Once configured and tested, a cronjob should be created for bin/daily.sh in the following format. This assumes that the system is in the central time zone (CST). The cronjob should not be run prior to 10:00am CST due to the schedule of common sources. The download and normalization process can take up to 18 hours or longer, especially on slow systems and when larger files are retrieved. 67 | ``` 68 | 0 10 * * * /path/to/inetdata/bin/daily.sh >> /path/to/inetdata/logs/cronjob.log 2>&1 69 | ``` 70 | 71 | Download jobs can be run manually through ``bin/download.sh``. To select which sources to download, specify a comma-separated list with the ``-s`` parameter. You can see a list of all enabled sources by running this script with the ``-l`` parameter. 72 | 73 | Normalize jobs can be run manually through ``bin/normalize.sh``. To select which sources to normalize, specify a comma-separated list with the ``-s`` parameter. You can see a list of all enabled sources by running this script with the ``-l`` parameter. 74 |
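For example, to list the enabled sources and then fetch and normalize only a couple of them (the source names below are illustrative; the comma-separated form follows the ``-s`` syntax described above):

```
# Show the sources enabled by conf/inetdata.json
./bin/download.sh -l

# Download and then normalize only the selected sources
./bin/download.sh -s sonar,czds
./bin/normalize.sh -s sonar,czds
```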
75 | ## Data Sources 76 | 77 | | Name | Description | Price | 78 | | ------------- |:-------------:| -----:| 79 | | [Sonar](https://scans.io) | FDNS, RDNS, UDP, TCP, TLS, HTTP, HTTPS scan data | FREE | 80 | | [Censys.io](https://www.censys.io/)| TCP, TLS, HTTP, HTTPS scan data | FREE (non-commercial) | 81 | | [CT](https://www.certificate-transparency.org/)| TLS certificates | FREE | 82 | | [CZDS](https://czds.icann.org/) | DNS zone files for "new" global TLDs | FREE | 83 | | [ARIN](https://www.arin.net) | American IP registry information (ASN, Org, Net, Poc) | FREE | 84 | | [CAIDA PFX2AS IPv4](http://data.caida.org/datasets/routing/routeviews-prefix2as) | Daily snapshots of ASN to IPv4 mappings | FREE | 85 | | [CAIDA PFX2AS IPv6](http://data.caida.org/datasets/routing/routeviews6-prefix2as) | Daily snapshots of ASN to IPv6 mappings | FREE | 86 | | [US Gov](https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/current-full.csv) | US government domain names | FREE | 87 | | [UK Gov](https://www.gov.uk/government/publications/list-of-gov-uk-domain-names) | UK government domain names | FREE | 88 | | [RIR Delegations](http://ftp.arin.net/pub/stats/) | Regional IP allocations | FREE | 89 | | [PremiumDrops](http://premiumdrops.com/) | DNS zone files for com/net/info/org/biz/xxx/sk/us TLDs | $24.95/mo | 90 | | [WhoisXMLAPI.com](https://WhoisXMLAPI.com/) | New domain whois data | $109/mo | 91 | 92 | ### Sonar 93 | 94 | Project Sonar is a community project sponsored by Rapid7. The latest data can be found at [https://scans.io/](https://scans.io/). More information about Project Sonar can be found on the official [website](https://sonar.labs.rapid7.com/). 95 | 96 | The download script pulls down the sonar.fdns and sonar.rdns datasets, which are updated weekly. In addition, this project pulls down the sonar.ssl and sonar.moressl "names" files (but not the rest of the certificate data). The normalization process converts the sonar.fdns and sonar.rdns files into a set of 97 | CSVs and MTBLs. These include both a forward and reverse lookup. These normalized files can be queried using standard unix utilities or MTBL front-ends such as mtbl_dump, rmtbl_dump, and mq. 98 | 99 | 100 | ### Censys 101 | 102 | The download script pulls down the weekly IPv4 file when configured with credentials. Unfortunately, due to the capped download speed and the size of the file, this must be excluded from automatic downloads, since the download process can take more than 24 hours, and the normalization another 8-12 hours. To configure this data 103 | source, register for an account at https://censys.io/ and fill in the following two fields in conf/inetdata.json: 104 | 105 | ```json 106 | { 107 | "censys_api_id": "", 108 | "censys_secret": "", 109 | } 110 | ``` 111 | 112 | Specify the censys source (``-s censys``) to use this with ``bin/download.rb`` or ``bin/normalize.rb``. 113 | 114 | 115 | ### Certificate Transparency 116 | 117 | The download script pulls down the full CT logs. Unfortunately, due to the size of these logs, this must be excluded from automatic downloads, since the download process can take more than 12 hours, and the normalization another 5-12 hours. 118 | 119 | Specify the ct source (``-s ct``) to use this with ``bin/download.rb`` or ``bin/normalize.rb``. 120 |
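For reference, the CT download implemented later in this repository (``lib/inetdata/source/ct.rb``) walks each configured log using the standard CT v1 endpoints: it fetches the signed tree head, then pages through ``get-entries`` in blocks of 2000, tracking its position in a per-log ``_meta.json`` file. A minimal sketch of the same idea, using one of the log URLs from ``conf/inetdata.json.sample`` as an example:

```ruby
require 'net/http'
require 'json'
require 'uri'

# Example log taken from the ct_logs list in conf/inetdata.json.sample
log_base = "https://ct.googleapis.com/pilot"

# Ask the log for its signed tree head to learn how many entries it holds
sth = JSON.parse(Net::HTTP.get(URI("#{log_base}/ct/v1/get-sth")))
puts "tree_size: #{sth['tree_size']}"

# Pull a small batch of leaf entries; the real downloader pages through
# the full log in chunks of 2000 and appends each batch to a data file
batch = JSON.parse(Net::HTTP.get(URI("#{log_base}/ct/v1/get-entries?start=0&end=63")))
puts "fetched #{batch['entries'].length} entries"
```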
121 | ### CZDS (ICANN) 122 | 123 | The download script pulls down all available CZDS zone files. To configure this data source, register for an account at https://czds.icann.org/, then apply for access to all desired zones through the CZDS web portal. The download script will automatically pull down all approved zones for your account. Once an account 124 | has been registered, fill in the czds_token field in conf/inetdata.json: 125 | 126 | ```json 127 | { 128 | "czds_token": "", 129 | } 130 | ``` 131 | 132 | 133 | ### ARIN (Bulk Data) 134 | 135 | The download script pulls down the daily nets, pocs, orgs, and asns files from ARIN. This requires the completion of a [bulk access agreement](https://www.arin.net/resources/agreements/bulkwhois.pdf), which needs to be physically mailed and approved by the ARIN team. Although most common use cases can be handled through the ARIN REST API, any automation that requires fuzzy matching or an extreme number of queries is better handled through bulk data. Once your account is enabled for bulk data, fill in the arin_api_key field in conf/inetdata.json: 136 | 137 | 138 | ```json 139 | { 140 | "arin_api_key": "API-", 141 | } 142 | ``` 143 | 144 | ### PremiumDrops 145 | 146 | The download script pulls down the daily full zone files for the following TLDs: com, net, info, org, biz, xxx, sk, and us. This is a commercial service and requires a monthly subscription fee to access. PremiumDrops provides the daily zone files at 9:00am CST each day, and any cronjob that automates the download should be scheduled after this time. The normalization process converts the zone files into a set of CSVs and MTBLs. These include both a forward and reverse lookup. These normalized files can be queried using standard unix utilities or MTBL front-ends such as mtbl_dump, rmtbl_dump, and mq. 147 | 148 | Note that while PremiumDrops supports TLS for most things, the actual data download must be over a clear-text connection. 149 | Once an account has been registered, fill in the following credentials in conf/inetdata.json: 150 | 151 | 152 | ```json 153 | { 154 | "premiumdrops_username": "", 155 | "premiumdrops_password": "", 156 | } 157 | ``` 158 | 159 | 160 | ### WhoisXMLAPI (New Domains) 161 | 162 | The download script can pull down whois information for new domains from the WhoisXMLAPI.com service. This is a commercial service and requires a monthly subscription fee to access. WhoisXMLAPI's updates are ready at 10:00am CST each day, and any cronjob that automates the download should be scheduled after this time. 163 | 164 | To sign up for new domain whois access, visit [https://www.whoisxmlapi.com/new-domain-pricing.php](https://www.whoisxmlapi.com/new-domain-pricing.php). The Enterprise plan ($109/mo) provides whois data for new daily domain registrations. Once an account has been registered, a separate username and password will be emailed to you with download credentials.
Add those credentials to conf/inetdata.json: 165 | 166 | 167 | ```json 168 | { 169 | "whoisxmlapi_username": "", 170 | "whoisxmlapi_password": "", 171 | } 172 | ``` 173 | -------------------------------------------------------------------------------- /bin/daily.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | THIS=$(readlink -f "$(dirname "$(readlink -f "$0")")/") 4 | ${THIS}/download.sh && ${THIS}/normalize.sh 5 | 6 | -------------------------------------------------------------------------------- /bin/daily_ct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | THIS=$(readlink -f "$(dirname "$(readlink -f "$0")")/") 4 | ${THIS}/download.sh -s ct # && ${THIS}/normalize.sh -s ct 5 | 6 | -------------------------------------------------------------------------------- /bin/download.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | BASE_PATH = File.expand_path(File.join(File.dirname(__FILE__), "..")) 3 | $LOAD_PATH.unshift(File.join(BASE_PATH, 'lib')) 4 | 5 | require 'inetdata' 6 | require 'optparse' 7 | 8 | options = {} 9 | OptionParser.new do |opts| 10 | opts.banner = "Usage: download.rb [options]" 11 | 12 | opts.on("-l", "--list-sources", "List available sources") do |opt| 13 | options[:list_sources] = true 14 | end 15 | opts.on("-s", "--sources [sources]", "Comma-separated list of sources to download") do |opt| 16 | options[:selected_sources] = opt.split(/,\s+/).uniq.map{|x| x.downcase} 17 | end 18 | end.parse! 19 | 20 | config = InetData::Config.new 21 | logger = InetData::Logger.new(config, 'download') 22 | 23 | allowed_sources = (InetData::Source.constants - [:Base]).map{|c| InetData::Source.const_get(c) } 24 | sources = [] 25 | 26 | allowed_sources.each do |sname| 27 | s = sname.new(config) 28 | if ! s.available? 29 | logger.log("Warning: Source #{s.name} is disabled due to configuration") 30 | next 31 | end 32 | 33 | if s.manual? && (options[:selected_sources].nil? || ! 
options[:selected_sources].include?(s.name)) 34 | logger.log("Warning: Source #{s.name} must be specified manually") 35 | next 36 | end 37 | 38 | sources << s 39 | 40 | end 41 | 42 | if options[:list_sources] 43 | $stderr.puts "Available Sources: " 44 | sources.each do |s| 45 | $stderr.puts " * #{s.name}" 46 | end 47 | exit(1) 48 | end 49 | 50 | if options[:selected_sources] 51 | sources = sources.select do |s| 52 | options[:selected_sources].include?(s.name) 53 | end 54 | end 55 | 56 | logger.log("Download initiated with sources: #{sources.map{|s| s.name}.join(", ")}") 57 | 58 | threads = [] 59 | sources.each do |s| 60 | threads << Thread.new do 61 | begin 62 | s.download 63 | rescue ::Exception 64 | logger.log("Error: Source #{s.name} threw an exception: #{$!.class} #{$!} #{$!.backtrace}") 65 | end 66 | end 67 | end 68 | 69 | # Wait for all downloads to finish 70 | threads.map{|t| t.join} 71 | 72 | logger.log("Download completed with sources: #{sources.map{|s| s.name}.join(", ")}") 73 | -------------------------------------------------------------------------------- /bin/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f /etc/profile.d/rvm.sh ]; then 4 | source /etc/profile.d/rvm.sh 5 | fi 6 | 7 | if [ -f ${HOME}/.bashrc ]; then 8 | source ${HOME}/.bashrc 9 | fi 10 | 11 | THIS=$(readlink -f "$(dirname "$(readlink -f "$0")")/") 12 | exec ${THIS}/download.rb $@ 13 | -------------------------------------------------------------------------------- /bin/normalize.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | BASE_PATH = File.expand_path(File.join(File.dirname(__FILE__), "..")) 3 | $LOAD_PATH.unshift(File.join(BASE_PATH, 'lib')) 4 | 5 | require 'inetdata' 6 | require 'optparse' 7 | 8 | desired_nofiles = 65535 9 | 10 | if InetData::Config.raise_rlimit_nofiles(desired_nofiles) < desired_nofiles 11 | $stderr.puts %Q|Error: ulimit(nofiles) could not be raised to #{desired_nofiles} 12 | 13 | Update /etc/security/limits.conf to include: 14 | 15 | * soft nofile #{desired_nofiles+1} 16 | * hard nofile #{desired_nofiles+1} 17 | root soft nofile #{desired_nofiles+1} 18 | root hard nofile #{desired_nofiles+1} 19 | 20 | Logout, log back in, and check the output of 'ulimit -n' 21 | 22 | Without this change, normalization jobs may fail without warning 23 | 24 | | 25 | exit(1) 26 | end 27 | 28 | options = {} 29 | OptionParser.new do |opts| 30 | opts.banner = "Usage: normalize [options]" 31 | 32 | opts.on("-l", "--list-sources", "List available sources") do |opt| 33 | options[:list_sources] = true 34 | end 35 | opts.on("-s", "--sources [sources]", "Comma-separated list of sources to normalize") do |opt| 36 | options[:selected_sources] = opt.split(/,\s+/).uniq.map{|x| x.downcase} 37 | end 38 | end.parse! 39 | 40 | config = InetData::Config.new 41 | logger = InetData::Logger.new(config, 'normalize') 42 | 43 | allowed_sources = (InetData::Source.constants - [:Base]).map{|c| InetData::Source.const_get(c) } 44 | sources = [] 45 | 46 | allowed_sources.each do |sname| 47 | s = sname.new(config) 48 | if ! s.available? 49 | logger.log("Warning: Source #{s.name} is disabled due to configuration") 50 | next 51 | end 52 | 53 | if s.manual? && (options[:selected_sources].nil? || ! 
options[:selected_sources].include?(s.name)) 54 | logger.log("Warning: Source #{s.name} must be specified manually") 55 | next 56 | end 57 | 58 | sources << s 59 | 60 | end 61 | 62 | if options[:list_sources] 63 | $stderr.puts "Available Sources: " 64 | sources.each do |s| 65 | $stderr.puts " * #{s.name}" 66 | end 67 | exit(1) 68 | end 69 | 70 | if options[:selected_sources] 71 | sources = sources.select do |s| 72 | options[:selected_sources].include?(s.name) 73 | end 74 | end 75 | 76 | logger.log("Normalize initiated with sources: #{sources.map{|s| s.name}.join(", ")}") 77 | 78 | sources.each do |s| 79 | begin 80 | s.normalize 81 | rescue ::InetData::Source::Base::NotImplemented 82 | # logger.log("Warning: Source #{s.name} does not implement normalize()") 83 | rescue ::Interrupt 84 | logger.log("Error: Source #{s.name} was interrupted: #{$!.class} #{$!} #{$!.backtrace}") 85 | rescue ::Exception 86 | logger.log("Error: Source #{s.name} threw an exception: #{$!.class} #{$!} #{$!.backtrace}") 87 | end 88 | end 89 | 90 | logger.log("Normalize completed with sources: #{sources.map{|s| s.name}.join(", ")}") 91 | -------------------------------------------------------------------------------- /bin/normalize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f /etc/profile.d/rvm.sh ]; then 4 | source /etc/profile.d/rvm.sh 5 | fi 6 | 7 | if [ -f ${HOME}/.bashrc ]; then 8 | source ${HOME}/.bashrc 9 | fi 10 | 11 | export PATH=/usr/local/bin:$PATH 12 | export GODEBUG=cgocheck=0 13 | 14 | THIS=$(readlink -f "$(dirname "$(readlink -f "$0")")/") 15 | exec ${THIS}/normalize.rb $@ 16 | -------------------------------------------------------------------------------- /conf/inetdata.json.sample: -------------------------------------------------------------------------------- 1 | { 2 | "storage": "./data/cache/", 3 | "reports": "./data/reports/", 4 | "logs": "./data/logs/", 5 | "log_stderr": true, 6 | "log_debug": false, 7 | 8 | "DISABLED_max_ram": "16G", 9 | "DISABLED_max_cores" : "4", 10 | 11 | "sonar_base_url": "https://opendata.rapid7.com", 12 | 13 | "censys_base_url": "https://www.censys.io/api/v1", 14 | "censys_api_id": "", 15 | "censys_secret": "", 16 | 17 | "czds_base_url": "https://czds.icann.org", 18 | "czds_token": "", 19 | 20 | "arin_api_key": "", 21 | 22 | "premiumdrops_username": "", 23 | "premiumdrops_password": "", 24 | "premiumdrops_urls" : [ 25 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=biz", 26 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=com", 27 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=info", 28 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=mobi", 29 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=net", 30 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=org", 31 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=sk", 32 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=us", 33 | "https://www.premiumdrops.com/list.php?a=request_full_zone&f=xxx", 34 | "https://www.premiumdrops.com/list.php?a=request_zone&f=biz", 35 | "https://www.premiumdrops.com/list.php?a=request_zone&f=com", 36 | "https://www.premiumdrops.com/list.php?a=request_zone&f=info", 37 | "https://www.premiumdrops.com/list.php?a=request_zone&f=mobi", 38 | "https://www.premiumdrops.com/list.php?a=request_zone&f=net", 39 | "https://www.premiumdrops.com/list.php?a=request_zone&f=org", 40 | 
"https://www.premiumdrops.com/list.php?a=request_zone&f=sk", 41 | "https://www.premiumdrops.com/list.php?a=request_zone&f=us", 42 | "https://www.premiumdrops.com/list.php?a=request_zone&f=xxx", 43 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=biz&t=diff", 44 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=biz&t=new", 45 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=com&t=diff", 46 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=com&t=new", 47 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=info&t=diff", 48 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=info&t=new", 49 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=mobi&t=diff", 50 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=mobi&t=new", 51 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=net&t=diff", 52 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=net&t=new", 53 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=org&t=diff", 54 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=org&t=new", 55 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=sk&t=diff", 56 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=sk&t=new", 57 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=us&t=diff", 58 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=us&t=new", 59 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=xxx&t=diff", 60 | "https://www.premiumdrops.com/list.php?a=request_zone_changes&f=xxx&t=new" 61 | ], 62 | 63 | "whoisxmlapi_base_url": "https://bestwhois.org", 64 | "whoisxmlapi_username": "", 65 | "whoisxmlapi_password": "", 66 | "whoisxmlapi_datasets": { 67 | "whois1": "/domain_name_data/domain_names_whois/", 68 | "whois2": "/domain_name_data/domain_names_whois2/", 69 | "gtld_whois1": "/ngtlds_domain_name_data/domain_names_whois/" 70 | }, 71 | 72 | "wwwsio_username": "", 73 | "wwwsio_password": "", 74 | "wwwsio_base_url": "https://wwws.io/api", 75 | 76 | "caida_prefix2as_ipv4_base_url": "http://data.caida.org/datasets/routing/routeviews-prefix2as", 77 | "caida_prefix2as_ipv6_base_url": "http://data.caida.org/datasets/routing/routeviews6-prefix2as", 78 | 79 | "gov_domains_url": "https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/current-full.csv", 80 | 81 | "govuk_domains_base_url": "https://www.gov.uk/government/publications/list-of-gov-uk-domain-names", 82 | 83 | "rir_delegation_urls" : [ 84 | "http://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest", 85 | "http://ftp.ripe.net/ripe/stats/delegated-ripencc-latest", 86 | "http://ftp.ripe.net/ripe/stats/delegated-ripencc-extended-latest", 87 | "http://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest", 88 | "http://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-extended-latest", 89 | "http://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest", 90 | "http://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest", 91 | "http://ftp.apnic.net/stats/apnic/legacy-apnic-latest", 92 | "http://ftp.apnic.net/stats/apnic/assigned-apnic-latest", 93 | "http://ftp.apnic.net/stats/apnic/delegated-apnic-latest", 94 | "http://ftp.apnic.net/stats/apnic/delegated-apnic-extended-latest", 95 | "http://ftp.apnic.net/stats/apnic/delegated-apnic-ipv6-assigned-latest", 96 | "http://ftp.arin.net/pub/originAS/originAS_arin-latest.txt", 97 | "http://ftp.arin.net/info/asn.txt" 
98 | ], 99 | 100 | "rir_database_unused" : [ 101 | "http://ftp.arin.net/pub/rr/ARIN.CURRENTSERIAL", 102 | "http://ftp.arin.net/pub/rr/arin.db", 103 | "http://ftp.ripe.net/ripe/dbase/RIPE.CURRENTSERIAL", 104 | "http://ftp.ripe.net/ripe/dbase/ripe.db.gz", 105 | "http://ftp.apnic.net/apnic/dbase/data/JPNIC.CURRENTSERIAL", 106 | "http://ftp.apnic.net/apnic/dbase/data/KRNIC.CURRENTSERIAL", 107 | "http://ftp.apnic.net/apnic/dbase/data/TWNIC.CURRENTSERIAL", 108 | "http://ftp.apnic.net/apnic/dbase/data/jpnic.db.gz", 109 | "http://ftp.apnic.net/apnic/dbase/data/krnic.db.gz", 110 | "http://ftp.apnic.net/apnic/dbase/data/twnic.in.gz", 111 | "http://ftp.apnic.net/apnic/dbase/data/twnic.pn.gz", 112 | "http://ftp.afrinic.net/dbase/AFRINIC.CURRENTSERIAL", 113 | "http://ftp.afrinic.net/dbase/afrinic.db.gz" 114 | ], 115 | 116 | "ct_logs": [ 117 | "ct.googleapis.com/logs/argon2017/", 118 | "ct.googleapis.com/logs/argon2018/", 119 | "ct.googleapis.com/logs/argon2019/", 120 | "ct.googleapis.com/logs/argon2020/", 121 | "ct.googleapis.com/logs/argon2021/", 122 | "ct.googleapis.com/logs/argon2022/", 123 | "ct.googleapis.com/logs/xenon2018/", 124 | "ct.googleapis.com/logs/xenon2019/", 125 | "ct.googleapis.com/logs/xenon2020/", 126 | "ct.googleapis.com/logs/xenon2021/", 127 | "ct.googleapis.com/logs/xenon2022/", 128 | "ct.googleapis.com/aviator/", 129 | "ct.googleapis.com/icarus/", 130 | "ct.googleapis.com/pilot/", 131 | "ct.googleapis.com/rocketeer/", 132 | "ct.googleapis.com/skydiver/", 133 | "ct.googleapis.com/submariner/", 134 | "ct.googleapis.com/daedalus/", 135 | "ct.googleapis.com/testtube/", 136 | "ct.googleapis.com/logs/crucible/", 137 | "ct.googleapis.com/logs/solera2018/", 138 | "ct.googleapis.com/logs/solera2019/", 139 | "ct.googleapis.com/logs/solera2020/", 140 | "ct.googleapis.com/logs/solera2021/", 141 | "ct.googleapis.com/logs/solera2022/", 142 | "ct.cloudflare.com/logs/nimbus2017/", 143 | "ct.cloudflare.com/logs/nimbus2018/", 144 | "ct.cloudflare.com/logs/nimbus2019/", 145 | "ct.cloudflare.com/logs/nimbus2020/", 146 | "ct.cloudflare.com/logs/nimbus2021/", 147 | "ct1.digicert-ct.com/log/", 148 | "ct2.digicert-ct.com/log/", 149 | "yeti2018.ct.digicert.com/log/", 150 | "yeti2019.ct.digicert.com/log/", 151 | "yeti2020.ct.digicert.com/log/", 152 | "yeti2021.ct.digicert.com/log/", 153 | "yeti2022.ct.digicert.com/log/", 154 | "nessie2018.ct.digicert.com/log/", 155 | "nessie2019.ct.digicert.com/log/", 156 | "nessie2020.ct.digicert.com/log/", 157 | "nessie2021.ct.digicert.com/log/", 158 | "nessie2022.ct.digicert.com/log/", 159 | "ct.ws.symantec.com/", 160 | "vega.ws.symantec.com/", 161 | "deneb.ws.symantec.com/", 162 | "sirius.ws.symantec.com/", 163 | "log.certly.io/", 164 | "ct.izenpe.com/", 165 | "ct.izenpe.eus/", 166 | "ct.wosign.com/", 167 | "ctlog.wosign.com/", 168 | "ctlog2.wosign.com/", 169 | "ct.gdca.com.cn/", 170 | "ctlog.gdca.com.cn/", 171 | "log.gdca.com.cn/", 172 | "log2.gdca.com.cn/", 173 | "dodo.ct.comodo.com/", 174 | "ctlog.api.venafi.com/", 175 | "ctlog-gen2.api.venafi.com/", 176 | "ctserver.cnnic.cn/", 177 | "ct.startssl.com/", 178 | "www.certificatetransparency.cn/ct/", 179 | "sabre.ct.comodo.com/", 180 | "mammoth.ct.comodo.com/", 181 | "flimsy.ct.nordu.net:8080/", 182 | "plausible.ct.nordu.net/", 183 | "ctlog.sheca.com/", 184 | "ct.sheca.com/", 185 | "ct.akamai.com/", 186 | "alpha.ctlogs.org/", 187 | "clicky.ct.letsencrypt.org/", 188 | "ct.filippo.io/behindthesofa/" 189 | ] 190 | } 191 | -------------------------------------------------------------------------------- /lib/inetdata.rb: 
-------------------------------------------------------------------------------- 1 | require 'fileutils' 2 | require 'net/http' 3 | require 'net/https' 4 | require 'ipaddr' 5 | require 'cgi' 6 | require 'uri' 7 | require 'json' 8 | require 'time' 9 | require 'shellwords' 10 | 11 | require 'inetdata/config' 12 | require 'inetdata/logger' 13 | require 'inetdata/source' 14 | 15 | module InetData 16 | VERSION = "1.2.2" 17 | end 18 | 19 | 20 | -------------------------------------------------------------------------------- /lib/inetdata/config.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | class Config < Hash 3 | 4 | def initialize(path=nil) 5 | root = File.expand_path(File.join(File.dirname(__FILE__), "..", "..")) 6 | unless path 7 | path = File.join(root, "conf", "inetdata.json") 8 | end 9 | 10 | unless File.exists?(path) && File.readable?(path) 11 | raise RuntimeError, "Missing configuration file: #{path}" 12 | end 13 | 14 | self.merge!(JSON.parse(File.read(path))) 15 | self['root'] = root 16 | 17 | %W{ storage logs reports }.each do |k| 18 | unless self[k].to_s.length > 0 19 | raise RuntimeError, "Missing configuration path for #{k}" 20 | end 21 | self[k] = File.expand_path(self[k].gsub(/^\.\//, self['root'] + '/')) 22 | end 23 | end 24 | 25 | def self.raise_rlimit_nofiles(nofiles) 26 | Process.setrlimit(Process::RLIMIT_NOFILE, nofiles) 27 | Process.getrlimit(Process::RLIMIT_NOFILE).first 28 | end 29 | 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/inetdata/logger.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | class Logger 3 | attr_accessor :config, :lname, :fd, :lock 4 | 5 | def initialize(config, lname) 6 | self.config = config 7 | self.lname = lname.gsub(/[^a-z0-9A-F_\-]+/, '') 8 | self.config[:logger] = self 9 | self.lock = Mutex.new 10 | end 11 | 12 | def log(msg) 13 | self.lock.synchronize do 14 | entry = "#{Time.now.strftime("%Y-%m-%d %H:%M:%S")} [#{lname}] #{msg}" 15 | if config['log_stderr'] 16 | $stderr.puts entry 17 | $stderr.flush 18 | end 19 | 20 | unless self.fd 21 | FileUtils.mkdir_p(config['logs']) 22 | self.fd = File.open(File.join(config['logs'], self.lname + ".txt"), "wb") 23 | end 24 | 25 | self.fd.puts(entry) 26 | self.fd.flush 27 | end 28 | end 29 | 30 | def dlog(msg) 31 | return unless config['log_debug'] 32 | log("DEBUG: #{msg}") 33 | end 34 | 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/inetdata/source.rb: -------------------------------------------------------------------------------- 1 | require 'inetdata/source/base' 2 | require 'inetdata/source/premiumdrops' 3 | require 'inetdata/source/whoisxmlapi' 4 | require 'inetdata/source/sonar' 5 | require 'inetdata/source/censys_ipv4' 6 | require 'inetdata/source/censys_certs' 7 | require 'inetdata/source/czds' 8 | require 'inetdata/source/arin' 9 | require 'inetdata/source/rir' 10 | require 'inetdata/source/caida_prefix2as' 11 | require 'inetdata/source/gov' 12 | require 'inetdata/source/govuk' 13 | require 'inetdata/source/wwwsio' 14 | require 'inetdata/source/ct' 15 | -------------------------------------------------------------------------------- /lib/inetdata/source/arin.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | 3 | module Source 4 | class ARIN < Base 5 | 6 | def available? 
7 | config['arin_api_key'].to_s.length > 0 8 | end 9 | 10 | def download_file(src, dst) 11 | tmp = dst + ".tmp" 12 | target = URI.parse(src) 13 | size = 0 14 | ims = false 15 | http = Net::HTTP.new(target.host, target.port) 16 | http.use_ssl = true 17 | 18 | req = Net::HTTP::Get.new(target.request_uri) 19 | 20 | if File.exists?(dst) 21 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 22 | ims = true 23 | end 24 | 25 | # Short-circuit the download if the local file exists due to the number of files 26 | if ims 27 | log(" > Skipped downloading of #{dst} due to existing file on disk") 28 | return true 29 | end 30 | 31 | http.request(req) do |res| 32 | 33 | if ims && res.code.to_i == 304 34 | log(" > Skipped downloading of #{dst} due to not modified response") 35 | return true 36 | end 37 | 38 | if ims && res['Content-Length'] 39 | if res['Content-Length'].to_i == File.size(dst) 40 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 41 | return true 42 | end 43 | end 44 | 45 | if res.code.to_i != 200 46 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 47 | return true 48 | end 49 | 50 | outp = File.open(tmp, "wb") 51 | 52 | res.read_body do |chunk| 53 | outp.write(chunk) 54 | size += chunk.length 55 | end 56 | 57 | outp.close 58 | end 59 | 60 | File.rename(tmp, dst) 61 | 62 | log(" > Downloading of #{dst} completed with #{size} bytes") 63 | end 64 | 65 | def download 66 | date = Time.now.strftime("%Y%m%d") 67 | dir = File.expand_path(File.join(storage_path, date)) 68 | FileUtils.mkdir_p(dir) 69 | 70 | %W{ nets asns orgs pocs }.each do |ftype| 71 | name = "#{ftype}.xml" 72 | url = "https://www.arin.net/public/secure/downloads/bulkwhois/#{name}?apikey=" + config['arin_api_key'] 73 | dst = File.join(dir, name) 74 | dst_gz = dst + ".gz" 75 | tmp = dst + ".tmp" 76 | tmp_gz = "#{tmp}.gz" 77 | 78 | if File.exists?(dst_gz) 79 | log("File already exists, skipping: #{dst_gz}") 80 | next 81 | end 82 | 83 | log("Dowloading #{dst}") 84 | download_file(url, tmp) 85 | cmd = "nice pigz #{tmp}" 86 | log("Running #{cmd}\n") 87 | system(cmd) 88 | File.rename(tmp_gz, dst_gz) 89 | end 90 | end 91 | 92 | # 93 | # Normalize the latest ARIN data 94 | # 95 | def normalize 96 | data = latest_data 97 | norm = File.join(data, "normalized") 98 | FileUtils.mkdir_p(norm) 99 | 100 | if File.exists?(File.join(norm, "_normalized_")) 101 | log("Normalized data is already present for #{data}") 102 | return true 103 | end 104 | 105 | unless inetdata_parsers_available? 106 | log("The inetdata-parsers tools are not in the execution path, aborting normalization") 107 | return false 108 | end 109 | 110 | %W{ nets asns orgs pocs }.each do |ftype| 111 | cmd = "nice pigz -dc #{data}/#{ftype}.xml.gz | nice inetdata-arin-xml2json /dev/stdin | nice pigz -c > #{norm}/#{ftype}.json.gz" 112 | log("Running #{cmd}\n") 113 | system(cmd) 114 | end 115 | 116 | %W{ nets asns orgs pocs }.each do |ftype| 117 | cmd = "nice pigz -dc #{data}/#{ftype}.xml.gz | nice inetdata-arin-xml2csv /dev/stdin | nice pigz -c > #{norm}/#{ftype}.csv.gz" 118 | log("Running #{cmd}\n") 119 | system(cmd) 120 | end 121 | 122 | File.open(File.join(norm, "_normalized_"), "wb") {|fd|} 123 | end 124 | 125 | # 126 | # Find the most recent dataset 127 | # 128 | def latest_data 129 | path = Dir["#{storage_path}/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]"]. 130 | sort{|a,b| b.split("/")[-1].to_i <=> a.split("/")[-1].to_i}. 
131 | first 132 | 133 | if not path 134 | raise RuntimeError, "No dataset available for #{self.name}" 135 | end 136 | 137 | path 138 | end 139 | 140 | end 141 | end 142 | end 143 | -------------------------------------------------------------------------------- /lib/inetdata/source/base.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class Base 4 | 5 | class NotImplemented < ::RuntimeError 6 | end 7 | 8 | @@have_inetdata_parsers = nil 9 | 10 | VALID_HOSTNAME = /^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*$/ 11 | MATCH_IPV6 = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/ 12 | MATCH_IPV4 = /^\s*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))\s*$/ 13 | MATCH_IPV4_PRIVATE = /^\s*(?:10\.|192\.168|172.(?:1[6-9]|2[0-9]|3[01])\.|169\.254)/ 14 | 15 | attr_accessor :config 16 | 17 | def self.name 18 | self.to_s.split('::').last.downcase 19 | end 20 | 21 | def name 22 | self.class.name 23 | end 24 | 25 | def initialize(config) 26 | self.config = config 27 | end 28 | 29 | def max_tries 30 | 5 31 | end 32 | 33 | def log(msg) 34 | config[:logger].log("[#{self.name}] #{msg}") 35 | end 36 | 37 | def dlog(msg) 38 | config[:logger].dlog("[#{self.name}] #{msg}") 39 | end 40 | 41 | def fail(reason) 42 | log("ERROR: #{reason}") 43 | raise RuntimeError, "[#{self.name}] FATAL ERROR: #{reason}" 44 | end 45 | 46 | def available? 47 | true 48 | end 49 | 50 | def manual? 51 | false 52 | end 53 | 54 | def storage_path 55 | File.expand_path(File.join(config['storage'], self.name)) 56 | end 57 | 58 | def reports_path 59 | File.expand_path(File.join(config['reports'], self.name)) 60 | end 61 | 62 | def download 63 | raise NotImplemented 64 | end 65 | 66 | def normalize 67 | raise NotImplemented 68 | end 69 | 70 | def validate_domain(dname) 71 | return false unless dname =~ VALID_HOSTNAME 72 | return false unless dname.index(".") 73 | dname.sub(/\.$/, '') 74 | end 75 | 76 | def inetdata_parsers_available? 77 | return @@have_inetdata_parsers unless @@have_inetdata_parsers.nil? 
78 | utils = %W{ 79 | inetdata-arin-org2cidrs inetdata-csv2mtbl inetdata-csvrollup inetdata-csvsplit inetdata-dns2mtbl 80 | inetdata-hostnames2domains inetdata-json2mtbl inetdata-lines2mtbl inetdata-zone2csv inetdata-arin-xml2json 81 | inetdata-arin-xml2csv inetdata-ct2mtbl mq inetdata-sonardnsv2-split 82 | } 83 | utils.each do |name| 84 | unless `which #{name}`.length > 0 85 | @@have_inetdata_parsers = false 86 | dlog("Missing inetdata-parsers command: #{name}") 87 | return 88 | end 89 | end 90 | @@have_inetdata_parsers = true 91 | return @@have_inetdata_parsers 92 | end 93 | 94 | def gzip_command 95 | @gzip_command ||= (`which pigz`.length > 0) ? "pigz" : "gzip" 96 | end 97 | 98 | def decompress_gzfile(path) 99 | cmd = [gzip_command, "-dc"] 100 | cmd.push(path) 101 | 102 | dlog("Decompressing #{path} with #{cmd} for #{self.name}...") 103 | if block_given? 104 | IO.popen(cmd, "rb") do |pipe| 105 | yield(pipe) 106 | end 107 | else 108 | return IO.popen(cmd) 109 | end 110 | end 111 | 112 | def expand_domains(hostname) 113 | return [] if hostname =~ MATCH_IPV4 114 | 115 | bits = hostname.split('.').select{|x| x.length > 0} 116 | outp = [] 117 | bits.shift 118 | 119 | while bits.length > 1 120 | outp << bits.join(".") 121 | bits.shift 122 | end 123 | 124 | outp 125 | end 126 | 127 | def uniq_sort_file(path, keep=false) 128 | pre = "LC_ALL=C nice sort #{get_sort_options} -u " 129 | dst = path + ".sorted" 130 | err = path + ".sort.err" 131 | old = path + ".unsorted" 132 | 133 | cmd = ("#{pre} #{Shellwords.shellescape(path)} >#{Shellwords.shellescape(dst)} 2>#{Shellwords.shellescape(err)}") 134 | dlog("Unique sorting #{path} for #{self.name}") 135 | ok = system(cmd) 136 | 137 | unless ok 138 | raise RuntimeError.new("Unique sort of #{path} triggered an error, stored in #{err}") 139 | end 140 | 141 | if keep 142 | File.rename(path, old) 143 | end 144 | 145 | if File.exists?(err) && File.size(err) == 0 146 | File.unlink(err) 147 | end 148 | 149 | File.rename(dst, path) 150 | true 151 | end 152 | 153 | def get_tempdir 154 | ENV['HOME'] || "/tmp" 155 | end 156 | 157 | def get_sort_options 158 | "-S #{get_max_ram_sort} --parallel=#{get_max_cores}" 159 | end 160 | 161 | def get_max_ram_sort 162 | config['max_ram'] || '50%' 163 | end 164 | 165 | def get_total_ram 166 | @max_total_ram ||= `free -g | grep ^Mem`.split(/\s+/)[1].to_i 167 | end 168 | 169 | def get_max_cores 170 | config['max_cores'] || File.read("/proc/cpuinfo").scan(/^processor\s+:/).length 171 | end 172 | 173 | end 174 | end 175 | end 176 | -------------------------------------------------------------------------------- /lib/inetdata/source/caida_prefix2as.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class CAIDA_Prefix2AS < Base 4 | 5 | def download_file(src, dst) 6 | tmp = dst + ".tmp" 7 | ims = false 8 | tries = 0 9 | 10 | begin 11 | tries += 1 12 | target = URI.parse(src) 13 | size = 0 14 | csize = nil 15 | 16 | http = Net::HTTP.new(target.host, target.port) 17 | 18 | # Invalid SSL certificate as of 12/1/2016 19 | if src.index("https") == 0 20 | http.use_ssl = true 21 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 22 | end 23 | 24 | req = Net::HTTP::Get.new(target.request_uri) 25 | 26 | if File.exists?(dst) 27 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 28 | ims = true 29 | end 30 | 31 | http.request(req) do |res| 32 | 33 | if ims && res.code.to_i == 304 34 | log(" > Skipped downloading of #{dst} due to not modified response") 35 | return 
true 36 | end 37 | 38 | if ims && res['Content-Length'] 39 | if res['Content-Length'].to_i == File.size(dst) 40 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 41 | return true 42 | end 43 | end 44 | 45 | if res.code.to_i >= 500 && res.code.to_i < 600 46 | raise RuntimeError, "Server Error: #{res.code} #{res.message}" 47 | end 48 | 49 | if res.code.to_i != 200 50 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 51 | return true 52 | end 53 | 54 | log("Download started from #{src} to #{dst}") 55 | outp = File.open(tmp, "wb") 56 | res.read_body do |chunk| 57 | outp.write(chunk) 58 | size += chunk.length 59 | end 60 | outp.close 61 | end 62 | 63 | File.rename(tmp, dst) 64 | 65 | rescue ::Interrupt 66 | raise $! 67 | rescue ::Exception 68 | if tries < self.max_tries 69 | log("Download failed: #{src} -> #{dst} : #{$!.class} #{$!}, retrying...") 70 | sleep(30) 71 | retry 72 | else 73 | fail("Download failed: #{src} -> #{dst} : #{$!.class} #{$!} after #{tries} attempts") 74 | end 75 | end 76 | log("Download completed from #{src} to #{dst}") 77 | end 78 | 79 | def download_index(url) 80 | target = URI.parse(url) 81 | 82 | tries = 0 83 | begin 84 | 85 | tries += 1 86 | http = Net::HTTP.new(target.host, target.port) 87 | 88 | # Invalid SSL certificate as of 12/1/2016 89 | if url.index("https") == 0 90 | http.use_ssl = true 91 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 92 | end 93 | 94 | req = Net::HTTP::Get.new(target.request_uri) 95 | res = http.request(req) 96 | 97 | unless (res and res.code.to_i == 200 and res.body.to_s.index("Index of /")) 98 | if res 99 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 100 | else 101 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 102 | end 103 | end 104 | 105 | res.body.to_s.scan(/a href=\"(routeviews[^\"]+)\"/m).map{|m| m.first} 106 | 107 | rescue ::Interrupt 108 | raise $! 109 | rescue ::Exception 110 | if tries < self.max_tries 111 | log("Index download failed: #{url} #{$!.class} #{$!}, #{$!.backtrace} retrying...") 112 | sleep(30) 113 | retry 114 | else 115 | fail("Index download failed: #{url} #{$!.class} #{$!} after #{tries} attempts") 116 | end 117 | end 118 | end 119 | 120 | def download 121 | path = Time.now.strftime("/%Y/%m/") 122 | ipv4 = config['caida_prefix2as_ipv4_base_url'] + path 123 | ipv6 = config['caida_prefix2as_ipv6_base_url'] + path 124 | 125 | [ipv4, ipv6].each do |rindex| 126 | download_index(rindex).each do |item| 127 | url = rindex + item 128 | targ = URI.parse(url) 129 | file = targ.path.split("/").last 130 | date = Time.now.strftime("%Y%m") 131 | dir = File.expand_path(File.join(storage_path, date)) 132 | dst = File.join(dir, file) 133 | FileUtils.mkdir_p(dir) 134 | download_file(url, dst) 135 | end 136 | end 137 | end 138 | 139 | end 140 | end 141 | end 142 | -------------------------------------------------------------------------------- /lib/inetdata/source/censys_certs.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class Censys_Certs < Base 4 | 5 | def manual? 6 | true 7 | end 8 | 9 | def available? 
10 | config['censys_api_id'].to_s.length > 0 && 11 | config['censys_secret'].to_s.length > 0 12 | end 13 | 14 | def download_file(src, dst) 15 | tmp = dst + ".tmp" 16 | target = URI.parse(src) 17 | size = 0 18 | ims = false 19 | http = Net::HTTP.new(target.host, target.port) 20 | 21 | if src.index("https") == 0 22 | http.use_ssl = true 23 | end 24 | 25 | req = Net::HTTP::Get.new(target.request_uri) 26 | 27 | if File.exists?(dst) 28 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 29 | ims = true 30 | end 31 | 32 | http.request(req) do |res| 33 | 34 | if ims && res.code.to_i == 304 35 | log(" > Skipped downloading of #{dst} due to not modified response") 36 | return true 37 | end 38 | 39 | if ims && res['Content-Length'] 40 | if res['Content-Length'].to_i == File.size(dst) 41 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 42 | return true 43 | end 44 | end 45 | 46 | if res.code.to_i != 200 47 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 48 | return true 49 | end 50 | 51 | log("Downloading #{src} with #{res['Content-Length']} bytes to #{dst}...") 52 | outp = File.open(tmp, "wb") 53 | res.read_body do |chunk| 54 | outp.write(chunk) 55 | size += chunk.length 56 | end 57 | 58 | outp.close 59 | end 60 | 61 | File.rename(tmp, dst) 62 | 63 | log(" > Downloading of #{dst} completed with #{size} bytes") 64 | end 65 | 66 | def query_json(src) 67 | target = URI.parse(src) 68 | tries = 0 69 | 70 | begin 71 | tries += 1 72 | http = Net::HTTP.new(target.host, target.port) 73 | http.use_ssl = true 74 | 75 | req = Net::HTTP::Get.new(target.request_uri) 76 | req.basic_auth(config['censys_api_id'], config['censys_secret']) 77 | 78 | res = http.request(req) 79 | 80 | unless (res and res.code.to_i == 200 and res['Content-Type'].index('application/json')) 81 | if res 82 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 83 | else 84 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 85 | end 86 | end 87 | 88 | return JSON.parse(res.body) 89 | 90 | rescue ::Interrupt 91 | raise $! 92 | rescue ::Exception 93 | if tries < self.max_tries 94 | log("Query of #{path} failed: #{$!.class} #{$!}, retrying...") 95 | sleep(30) 96 | retry 97 | else 98 | fail("Query of #{path} failed: #{$!.class} #{$!} after #{tries} attempts") 99 | end 100 | end 101 | end 102 | 103 | def download 104 | meta = query_json(config['censys_base_url'] + '/data') 105 | dir = storage_path 106 | FileUtils.mkdir_p(dir) 107 | 108 | mqueue = [] 109 | dqueue = {} 110 | if dbase = meta['primary_series'] 111 | ['All X.509 Certificates'].each do |dtype| 112 | if dbase[dtype] && 113 | dbase[dtype]['latest_result'] && 114 | dbase[dtype]['latest_result']['details_url'] 115 | mqueue << dbase[dtype]['latest_result']['details_url'] 116 | end 117 | end 118 | end 119 | 120 | mqueue.each do |mpath| 121 | info = query_json(mpath) 122 | 123 | if info && 124 | info['series'] && 125 | info['series']['id'] && 126 | info['primary_file'] && 127 | info['primary_file']['compressed_download_path'] && 128 | info['timestamp'] 129 | 130 | fname = ( [ info['series']['id'], info['timestamp'].to_s ].join("-") + 131 | ".json." 
+ info['primary_file']['compressed_download_path'].split('.').last 132 | ).gsub("/", "_") 133 | 134 | dst = File.join(dir, fname) 135 | dqueue[info['primary_file']['compressed_download_path']] = dst 136 | end 137 | end 138 | 139 | dqueue.each_pair do |src,dst| 140 | download_file(src, dst) 141 | end 142 | end 143 | 144 | end 145 | end 146 | end 147 | -------------------------------------------------------------------------------- /lib/inetdata/source/censys_ipv4.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class Censys_IPv4 < Base 4 | 5 | def manual? 6 | true 7 | end 8 | 9 | def available? 10 | config['censys_api_id'].to_s.length > 0 && 11 | config['censys_secret'].to_s.length > 0 12 | end 13 | 14 | def download_file(src, dst) 15 | tmp = dst + ".tmp" 16 | target = URI.parse(src) 17 | size = 0 18 | ims = false 19 | http = Net::HTTP.new(target.host, target.port) 20 | 21 | if src.index("https") == 0 22 | http.use_ssl = true 23 | end 24 | 25 | req = Net::HTTP::Get.new(target.request_uri) 26 | 27 | if File.exists?(dst) 28 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 29 | ims = true 30 | end 31 | 32 | http.request(req) do |res| 33 | 34 | if ims && res.code.to_i == 304 35 | log(" > Skipped downloading of #{dst} due to not modified response") 36 | return true 37 | end 38 | 39 | if ims && res['Content-Length'] 40 | if res['Content-Length'].to_i == File.size(dst) 41 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 42 | return true 43 | end 44 | end 45 | 46 | if res.code.to_i != 200 47 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 48 | return true 49 | end 50 | 51 | log("Downloading #{src} with #{res['Content-Length']} bytes to #{dst}...") 52 | outp = File.open(tmp, "wb") 53 | res.read_body do |chunk| 54 | outp.write(chunk) 55 | size += chunk.length 56 | end 57 | 58 | outp.close 59 | end 60 | 61 | File.rename(tmp, dst) 62 | 63 | log(" > Downloading of #{dst} completed with #{size} bytes") 64 | end 65 | 66 | def query_json(src) 67 | target = URI.parse(src) 68 | tries = 0 69 | 70 | begin 71 | tries += 1 72 | http = Net::HTTP.new(target.host, target.port) 73 | http.use_ssl = true 74 | 75 | req = Net::HTTP::Get.new(target.request_uri) 76 | req.basic_auth(config['censys_api_id'], config['censys_secret']) 77 | 78 | res = http.request(req) 79 | 80 | unless (res and res.code.to_i == 200 and res['Content-Type'].index('application/json')) 81 | if res 82 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 83 | else 84 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 85 | end 86 | end 87 | 88 | return JSON.parse(res.body) 89 | 90 | rescue ::Interrupt 91 | raise $! 
92 | rescue ::Exception 93 | if tries < self.max_tries 94 | log("Query of #{path} failed: #{$!.class} #{$!}, retrying...") 95 | sleep(30) 96 | retry 97 | else 98 | fail("Query of #{path} failed: #{$!.class} #{$!} after #{tries} attempts") 99 | end 100 | end 101 | end 102 | 103 | def download 104 | meta = query_json(config['censys_base_url'] + '/data') 105 | dir = storage_path 106 | FileUtils.mkdir_p(dir) 107 | 108 | mqueue = [] 109 | dqueue = {} 110 | if dbase = meta['primary_series'] 111 | ['IPv4 Snapshots'].each do |dtype| 112 | if dbase[dtype] && 113 | dbase[dtype]['latest_result'] && 114 | dbase[dtype]['latest_result']['details_url'] 115 | mqueue << dbase[dtype]['latest_result']['details_url'] 116 | end 117 | end 118 | end 119 | 120 | mqueue.each do |mpath| 121 | info = query_json(mpath) 122 | 123 | if info && 124 | info['series'] && 125 | info['series']['id'] && 126 | info['primary_file'] && 127 | info['primary_file']['compressed_download_path'] && 128 | info['timestamp'] 129 | 130 | fname = ( [ info['series']['id'], info['timestamp'].to_s ].join("-") + 131 | ".json." + info['primary_file']['compressed_download_path'].split('.').last 132 | ).gsub("/", "_") 133 | 134 | dst = File.join(dir, fname) 135 | dqueue[info['primary_file']['compressed_download_path']] = dst 136 | end 137 | end 138 | 139 | dqueue.each_pair do |src,dst| 140 | download_file(src, dst) 141 | end 142 | end 143 | 144 | def normalize 145 | data = storage_path 146 | norm = File.join(data, "normalized") 147 | FileUtils.mkdir_p(norm) 148 | 149 | unless inetdata_parsers_available? 150 | log("The inetdata-parsers tools are not in the execution path, aborting normalization") 151 | return false 152 | end 153 | 154 | src = latest_data 155 | unless src 156 | log("Error: no dataset is available") 157 | return 158 | end 159 | 160 | dst = src.sub(/\.json\.lz4$/, '.mtbl') 161 | if File.exists?(dst) 162 | log("Data file #{src} is already normalized at #{dst}") 163 | return 164 | end 165 | dst_tmp = dst + ".tmp" 166 | 167 | if `which lz4cat`.to_s.length == 0 168 | log("Error: the 'lz4cat' binary is not available") 169 | return 170 | end 171 | 172 | mtbl_cmd = "nice lz4cat -dc #{Shellwords.shellescape(src)} | " + 173 | "nice inetdata-json2mtbl -k ip -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{Shellwords.shellescape(dst_tmp)}" 174 | log("Running #{mtbl_cmd}") 175 | system(mtbl_cmd) 176 | File.rename(dst_tmp, dst) 177 | end 178 | 179 | # 180 | # Find the most recent dataset 181 | # 182 | def latest_data 183 | path = Dir["#{storage_path}/ipv4-*.json.lz4"].sort { |a,b| 184 | File.basename(b).sub(/.*ipv4-(\d+)T.*/){|x| $1 }.to_i <=> 185 | File.basename(a).sub(/.*ipv4-(\d+)T.*/){|x| $1 }.to_i 186 | }.first 187 | 188 | if not path 189 | raise RuntimeError, "No IPv4 dataset available for #{self.name}" 190 | end 191 | 192 | path 193 | end 194 | 195 | # 196 | # Find the most recent normalized dataset 197 | # 198 | def latest_normalized_data(dtype) 199 | path = Dir["#{storage_path}/normalized/ipv4-*.mtbl"].sort { |a,b| 200 | File.basename(b).sub(/.*ipv4-(\d+)T.*/){|x| $1 }.to_i <=> 201 | File.basename(a).sub(/.*ipv4-(\d+)T.*/){|x| $1 }.to_i 202 | }.first 203 | 204 | if not path 205 | raise RuntimeError, "No IPv4 normalized_dataset available for #{self.name}" 206 | end 207 | 208 | path 209 | end 210 | 211 | end 212 | end 213 | end 214 | -------------------------------------------------------------------------------- /lib/inetdata/source/ct.rb: -------------------------------------------------------------------------------- 1 | module 
InetData 2 | module Source 3 | class CT < Base 4 | 5 | def manual? 6 | true 7 | end 8 | 9 | def ct_request(url) 10 | target = URI.parse(url) 11 | tries = 0 12 | begin 13 | 14 | tries += 1 15 | http = Net::HTTP.new(target.host, target.port) 16 | 17 | if url.index("https") == 0 18 | http.use_ssl = true 19 | end 20 | 21 | # Necessary but probably not harmful given how the data is used 22 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 23 | 24 | req = Net::HTTP::Get.new(target.request_uri) 25 | res = http.request(req) 26 | 27 | unless (res and res.code.to_i == 200 and res['Content-Type'].index('application/json')) 28 | if res 29 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 30 | else 31 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 32 | end 33 | end 34 | 35 | return JSON.parse(res.body) 36 | 37 | rescue ::Interrupt 38 | raise $! 39 | rescue ::Exception 40 | if tries < self.max_tries 41 | log("CT request failed: #{url} #{$!.class} #{$!}, retrying...") 42 | sleep(30) 43 | retry 44 | else 45 | fail("CT request failed: #{url} #{$!.class} #{$!} after #{tries} attempts") 46 | end 47 | end 48 | end 49 | 50 | 51 | def ct_sync(log_name, log_base) 52 | 53 | nrecs = 0 54 | state = nil 55 | 56 | meta_file = File.join(storage_path, "#{log_name}_meta.json") 57 | if File.exists?(meta_file) 58 | state = JSON.parse(File.read(meta_file)) 59 | end 60 | state ||= { 'entries' => 0 } 61 | 62 | # Data files are in the format of _data_.json 63 | data_file = File.join(storage_path, "#{log_name}_data_#{state['entries']}.json") 64 | 65 | sth = ct_request(log_base + '/ct/v1/get-sth') 66 | return unless sth and sth['tree_size'] 67 | 68 | if sth['tree_size'] == state['entries'] 69 | log("#{log_name} is already synchronized with #{state['entries']} entries") 70 | return 71 | end 72 | 73 | log("#{log_name} has #{sth['tree_size']} total records available") 74 | 75 | while state['entries'] < (sth['tree_size'] - 1) 76 | 77 | entry_beg = state['entries'] 78 | entry_end = [ state['entries'] + 2000, sth['tree_size'] - 1 ].min 79 | 80 | get_url = log_base + "/ct/v1/get-entries?start=#{entry_beg}&end=#{entry_end}" 81 | data = ct_request(get_url) 82 | if not (data && data['entries']) 83 | fail("#{log_name} returned bad data: #{data.inspect}") 84 | return 85 | end 86 | 87 | # Write the CT response data 88 | File.open(data_file, "ab") do |fd| 89 | data['entries'].each do |entry| 90 | fd.puts(entry.to_json) 91 | end 92 | end 93 | 94 | state['entries'] += data['entries'].length 95 | nrecs += data['entries'].length 96 | 97 | # Update the meta file 98 | File.open(meta_file, "w") do |fd| 99 | fd.puts(state.to_json) 100 | end 101 | 102 | log("#{log_name} downloaded #{state['entries']}/#{sth['tree_size']} records") 103 | end 104 | 105 | # Compress the data file if new records were downloaded 106 | if nrecs > 0 107 | log("#{log_name} compressing data file containing #{nrecs} records: #{data_file}") 108 | system("nice #{gzip_command} #{Shellwords.shellescape(data_file)}") 109 | end 110 | 111 | log("#{log_name} synchronized with #{nrecs} new entries (#{state['entries']} total)") 112 | end 113 | 114 | def download 115 | dir = storage_path 116 | FileUtils.mkdir_p(dir) 117 | 118 | ct_logs = config['ct_logs'] 119 | 120 | ct_threads = [] 121 | ct_logs.each do |log_base| 122 | # Trim the trailing slash from log_base 123 | log_base.gsub!(/\/+$/, '') 124 | 125 | # Determine the log name from the url 126 | log_name = log_base.gsub("/", 
"_") 127 | 128 | ct_threads << Thread.new(log_name, log_base) do |lname,lbase| 129 | begin 130 | ct_sync(lname, "https://" + lbase) 131 | rescue ::Exception => e 132 | log("#{lname} failed to sync: #{e} #{e.backtrace}") 133 | end 134 | end 135 | end 136 | 137 | ct_threads.each {|t| t.join } 138 | end 139 | 140 | def normalize 141 | data = storage_path 142 | norm = File.join(data, "normalized") 143 | FileUtils.mkdir_p(norm) 144 | 145 | unless inetdata_parsers_available? 146 | log("The inetdata-parsers tools are not in the execution path, aborting normalization") 147 | return false 148 | end 149 | 150 | Dir["#{data}/*_data_*.json.gz"].sort.each do |src| 151 | dst = File.join(norm, File.basename(src).sub(/\.json\.gz$/, '.mtbl')) 152 | next if File.exists?(dst) 153 | dst_tmp = dst + ".tmp" 154 | 155 | host_cmd = 156 | "nice #{gzip_command} -dc #{Shellwords.shellescape(src)} | " + 157 | "nice inetdata-ct2mtbl -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{Shellwords.shellescape(dst_tmp)}" 158 | 159 | log("Processing #{src} with command: #{host_cmd}") 160 | system(host_cmd) 161 | File.rename(dst_tmp, dst) 162 | end 163 | end 164 | 165 | end 166 | end 167 | end 168 | -------------------------------------------------------------------------------- /lib/inetdata/source/czds.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class CZDS < Base 4 | 5 | def available? 6 | config['czds_token'].to_s.length > 0 7 | end 8 | 9 | def download_zone_list 10 | target = URI.parse(config['czds_base_url'] + '/en/user-zone-data-urls.json?token=' + config['czds_token']) 11 | 12 | tries = 0 13 | begin 14 | 15 | tries += 1 16 | http = Net::HTTP.new(target.host, target.port) 17 | http.use_ssl = true 18 | 19 | req = Net::HTTP::Get.new(target.request_uri) 20 | res = http.request(req) 21 | 22 | unless (res and res.code.to_i == 200 and res['Content-Type'] == 'application/json') 23 | if res 24 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 25 | else 26 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 27 | end 28 | end 29 | 30 | return JSON.parse(res.body) 31 | 32 | rescue ::Interrupt 33 | raise $! 
34 | rescue ::Exception 35 | if tries < self.max_tries 36 | log("Zone list failed: #{$!.class} #{$!}, retrying...") 37 | sleep(30) 38 | retry 39 | else 40 | fail("Zone list failed: #{$!.class} #{$!} after #{tries} attempts") 41 | end 42 | end 43 | end 44 | 45 | def download_file(src, dst) 46 | tmp = dst + ".tmp" 47 | target = URI.parse(src) 48 | size = 0 49 | ims = false 50 | http = Net::HTTP.new(target.host, target.port) 51 | http.use_ssl = true 52 | 53 | req = Net::HTTP::Get.new(target.request_uri) 54 | 55 | if File.exists?(dst) 56 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 57 | ims = true 58 | end 59 | 60 | # Short-circuit the download if the local file exists due to the number of files 61 | if ims 62 | log(" > Skipped downloading of #{dst} due to existing file on disk") 63 | return true 64 | end 65 | 66 | http.request(req) do |res| 67 | 68 | if ims && res.code.to_i == 304 69 | log(" > Skipped downloading of #{dst} due to not modified response") 70 | return true 71 | end 72 | 73 | if ims && res['Content-Length'] 74 | if res['Content-Length'].to_i == File.size(dst) 75 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 76 | return true 77 | end 78 | end 79 | 80 | if res.code.to_i != 200 81 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 82 | return true 83 | end 84 | 85 | outp = File.open(tmp, "wb") 86 | 87 | res.read_body do |chunk| 88 | outp.write(chunk) 89 | size += chunk.length 90 | end 91 | 92 | outp.close 93 | end 94 | 95 | File.rename(tmp, dst) 96 | 97 | log(" > Downloading of #{dst} completed with #{size} bytes") 98 | end 99 | 100 | def download 101 | zones = download_zone_list 102 | zones.each do |url| 103 | unless url.index('http') == 0 104 | url = config['czds_base_url'] + url 105 | end 106 | 107 | target = URI.parse(url) 108 | zone_id = target.path.split("/").last 109 | 110 | date = Time.now.strftime("%Y%m%d") 111 | ext = ".txt.gz" 112 | dir = File.expand_path(File.join(storage_path, date)) 113 | dst = File.join(dir, "#{zone_id}#{ext}") 114 | 115 | FileUtils.mkdir_p(dir) 116 | 117 | log("Dowloading #{dst}") 118 | download_file(url, dst) 119 | end 120 | end 121 | 122 | # 123 | # Normalize the latest CZDS zones 124 | # 125 | def normalize 126 | data = latest_data 127 | norm = File.expand_path(File.join(data, "..", "normalized")) 128 | FileUtils.mkdir_p(norm) 129 | 130 | date = data.split("/").last 131 | 132 | if File.exists?(File.join(norm, "#{date}-czds-names.mtbl")) 133 | log("Normalized data is already present for #{data}") 134 | return true 135 | end 136 | 137 | unless inetdata_parsers_available? 
138 | log("The inetdata-parsers tools are not in the execution path, aborting normalization") 139 | return false 140 | end 141 | 142 | csv_cmd = "nice #{gzip_command} -dc #{data}/*.gz | " + 143 | "nice inetdata-zone2csv | " + 144 | "nice inetdata-csvsplit -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{norm}/#{date}-czds" 145 | 146 | log("Running #{csv_cmd}\n") 147 | system(csv_cmd) 148 | 149 | [ 150 | "#{norm}/#{date}-czds-names.gz", 151 | "#{norm}/#{date}-czds-names-inverse.gz" 152 | ].each do |f| 153 | o = f.sub(".gz", ".mtbl.tmp") 154 | mtbl_cmd = "nice #{gzip_command} -dc #{Shellwords.shellescape(f)} | inetdata-dns2mtbl -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{o}" 155 | log("Running #{mtbl_cmd}") 156 | system(mtbl_cmd) 157 | File.rename(o, o.gsub(/\.tmp$/, '')) 158 | end 159 | end 160 | 161 | # 162 | # Find the most recent dataset 163 | # 164 | def latest_data 165 | path = Dir["#{storage_path}/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]"]. 166 | sort{|a,b| b.split("/")[-1].to_i <=> a.split("/")[-1].to_i}. 167 | first 168 | 169 | if not path 170 | raise RuntimeError, "No dataset available for #{self.name}" 171 | end 172 | 173 | path 174 | end 175 | 176 | end 177 | end 178 | end 179 | -------------------------------------------------------------------------------- /lib/inetdata/source/gov.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class GOV < Base 4 | 5 | def download_file(src, dst) 6 | tmp = dst + ".tmp" 7 | ims = false 8 | tries = 0 9 | 10 | begin 11 | tries += 1 12 | target = URI.parse(src) 13 | size = 0 14 | csize = nil 15 | 16 | http = Net::HTTP.new(target.host, target.port) 17 | if src.index("https") == 0 18 | http.use_ssl = true 19 | end 20 | 21 | req = Net::HTTP::Get.new(target.request_uri) 22 | 23 | if File.exists?(dst) 24 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 25 | ims = true 26 | end 27 | 28 | http.request(req) do |res| 29 | 30 | if ims && res.code.to_i == 304 31 | log(" > Skipped downloading of #{dst} due to not modified response") 32 | return true 33 | end 34 | 35 | if ims && res['Content-Length'] 36 | if res['Content-Length'].to_i == File.size(dst) 37 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 38 | return true 39 | end 40 | end 41 | 42 | if res.code.to_i != 200 43 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 44 | return true 45 | end 46 | 47 | log("Download started from #{src} to #{dst}") 48 | outp = File.open(tmp, "wb") 49 | res.read_body do |chunk| 50 | outp.write(chunk) 51 | size += chunk.length 52 | end 53 | outp.close 54 | end 55 | 56 | File.rename(tmp, dst) 57 | 58 | rescue ::Interrupt 59 | raise $! 
60 | rescue ::Exception 61 | if tries < self.max_tries 62 | log("Download failed: #{src} -> #{dst} : #{$!.class} #{$!}, retrying...") 63 | sleep(30) 64 | retry 65 | else 66 | fail("Download failed: #{src} -> #{dst} : #{$!.class} #{$!} after #{tries} attempts") 67 | end 68 | end 69 | log("Download completed from #{src} to #{dst}") 70 | end 71 | 72 | # 73 | # Download the latest data file 74 | # 75 | def download 76 | url = config['gov_domains_url'] 77 | targ = URI.parse(url) 78 | file = datafile_name 79 | date = Time.now.strftime("%Y%m%d") 80 | dir = File.expand_path(File.join(storage_path, date)) 81 | dst = File.join(dir, file) 82 | FileUtils.mkdir_p(dir) 83 | download_file(url, dst) 84 | end 85 | 86 | # 87 | # Normalize the latest data file 88 | # 89 | def normalize 90 | data = latest_data 91 | norm = File.join(data, "normalized") 92 | FileUtils.mkdir_p(norm) 93 | 94 | if File.exists?(File.join(norm, "domains.txt")) 95 | log("Normalized data is already present for #{data}") 96 | return 97 | end 98 | 99 | src = File.join(data, datafile_name) 100 | dst = File.join(norm, "domains.txt") 101 | tmp = dst + ".tmp" 102 | 103 | File.open(tmp, "wb") do |fd| 104 | File.open(src, "rb") do |r| 105 | r.each_line do |line| 106 | next if line =~ /^Domain Name,/ 107 | dname = validate_domain(line.strip.downcase.split(",").first.to_s) 108 | if dname 109 | fd.puts dname 110 | else 111 | log("Invalid hostname in #{self.name} : #{src} -> #{line.strip}") 112 | end 113 | end 114 | end 115 | end 116 | uniq_sort_file(tmp) 117 | File.rename(tmp, dst) 118 | end 119 | 120 | # 121 | # Find the most recent dataset 122 | # 123 | def latest_data 124 | path = Dir["#{storage_path}/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]/#{datafile_name}"]. 125 | sort{|a,b| b.split("/")[-2].to_i <=> a.split("/")[-2].to_i}. 
126 | first 127 | 128 | if not path 129 | raise RuntimeError, "No dataset available for #{self.name}" 130 | end 131 | 132 | File.dirname(path) 133 | end 134 | 135 | # 136 | # The local name of the data file 137 | # 138 | def datafile_name 139 | "current-full.csv" 140 | end 141 | 142 | end 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /lib/inetdata/source/govuk.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class GOVUK < Base 4 | 5 | def download_file(src, dst) 6 | tmp = dst + ".tmp" 7 | ims = false 8 | tries = 0 9 | 10 | begin 11 | tries += 1 12 | target = URI.parse(src) 13 | size = 0 14 | csize = nil 15 | 16 | http = Net::HTTP.new(target.host, target.port) 17 | if src.index("https") == 0 18 | http.use_ssl = true 19 | end 20 | 21 | req = Net::HTTP::Get.new(target.request_uri) 22 | 23 | if File.exists?(dst) 24 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 25 | ims = true 26 | end 27 | 28 | http.request(req) do |res| 29 | 30 | if ims && res.code.to_i == 304 31 | log(" > Skipped downloading of #{dst} due to not modified response") 32 | return true 33 | end 34 | 35 | if ims && res['Content-Length'] 36 | if res['Content-Length'].to_i == File.size(dst) 37 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 38 | return true 39 | end 40 | end 41 | 42 | if res.code.to_i >= 500 && res.code.to_i < 600 43 | raise RuntimeError, "Server Error: #{res.code} #{res.message}" 44 | end 45 | 46 | if res.code.to_i != 200 47 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 48 | return true 49 | end 50 | 51 | log("Download started from #{src} to #{dst}") 52 | outp = File.open(tmp, "wb") 53 | res.read_body do |chunk| 54 | outp.write(chunk) 55 | size += chunk.length 56 | end 57 | outp.close 58 | end 59 | 60 | File.rename(tmp, dst) 61 | 62 | rescue ::Interrupt 63 | raise $! 64 | rescue ::Exception 65 | if tries < self.max_tries 66 | log("Download failed: #{src} -> #{dst} : #{$!.class} #{$!}, retrying...") 67 | sleep(30) 68 | retry 69 | else 70 | fail("Download failed: #{src} -> #{dst} : #{$!.class} #{$!} after #{tries} attempts") 71 | end 72 | end 73 | log("Download completed from #{src} to #{dst}") 74 | end 75 | 76 | def download_index(url) 77 | target = URI.parse(url) 78 | 79 | tries = 0 80 | begin 81 | 82 | tries += 1 83 | http = Net::HTTP.new(target.host, target.port) 84 | http.use_ssl = true if url.index("https") == 0 85 | 86 | req = Net::HTTP::Get.new(target.request_uri) 87 | res = http.request(req) 88 | 89 | unless (res and res.code.to_i == 200 and res.body.to_s.index("List of .gov.uk domain names")) 90 | if res 91 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 92 | else 93 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 94 | end 95 | end 96 | 97 | res.body.to_s.scan(/a href=\"(\/[^\"]+)\"/m).map{|m| m.first}.select{|m| m =~ /\.csv$/} 98 | 99 | rescue ::Interrupt 100 | raise $! 
101 | rescue ::Exception 102 | if tries < self.max_tries 103 | log("Index download failed: #{url} #{$!.class} #{$!}, #{$!.backtrace} retrying...") 104 | sleep(30) 105 | retry 106 | else 107 | fail("Index download failed: #{url} #{$!.class} #{$!} after #{tries} attempts") 108 | end 109 | end 110 | end 111 | 112 | def download 113 | download_index(config['govuk_domains_base_url']).each do |item| 114 | targ = URI.parse(config['govuk_domains_base_url']) 115 | targ.path = item 116 | file = item.split("/").last 117 | dir = storage_path 118 | dst = File.join(dir, file) 119 | FileUtils.mkdir_p(dir) 120 | download_file(targ.to_s, dst) 121 | end 122 | end 123 | 124 | # 125 | # Normalize the latest data file 126 | # 127 | def normalize 128 | src = latest_data 129 | data = File.dirname(src) 130 | norm = File.join(data, "normalized") 131 | FileUtils.mkdir_p(norm) 132 | 133 | dst = File.join(norm, "domains.txt") 134 | tmp = dst + ".tmp" 135 | 136 | if File.exists?(dst) && File.mtime(src) <= File.mtime(dst) 137 | log("Normalized data is already present for #{src}") 138 | return 139 | end 140 | 141 | File.open(tmp, "wb") do |fd| 142 | File.open(src, "rb") do |r| 143 | r.read.gsub(/\r\n?/, "\n").each_line do |line| 144 | next unless line.index(".gov.uk,") 145 | dname = validate_domain(line.strip.downcase.split(",").first.to_s) 146 | if dname 147 | fd.puts dname 148 | else 149 | log("Invalid hostname in #{self.name} : #{src} -> #{line.strip}") 150 | end 151 | end 152 | end 153 | end 154 | uniq_sort_file(tmp) 155 | File.rename(tmp, dst) 156 | end 157 | 158 | # 159 | # Find the most recent dataset 160 | # 161 | def latest_data 162 | files = Dir["#{storage_path}/*.csv"] 163 | path = nil 164 | 165 | Time.now.year.downto(2013) do |year| 166 | path = files.select{|f| f =~ /_#{year}\.csv$/}.first 167 | break if path 168 | end 169 | 170 | path ||= files.first 171 | 172 | if not path 173 | raise RuntimeError, "No dataset available for #{self.name}" 174 | end 175 | 176 | path 177 | end 178 | 179 | end 180 | end 181 | end 182 | -------------------------------------------------------------------------------- /lib/inetdata/source/premiumdrops.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class PremiumDrops < Base 4 | 5 | def available?
6 | config['premiumdrops_username'].to_s.length > 0 && 7 | config['premiumdrops_password'].to_s.length > 0 8 | end 9 | 10 | def obtain_session_id(username, password) 11 | target = URI.parse('https://www.premiumdrops.com/user.php') 12 | 13 | tries = 0 14 | session_id = nil 15 | 16 | begin 17 | tries += 1 18 | http = Net::HTTP.new(target.host, target.port) 19 | 20 | # Unpleasantness due to shoddy SSL configuration at premiumdrops 21 | http.use_ssl = true 22 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 23 | 24 | req = Net::HTTP::Post.new(target.path) 25 | req.set_form_data({ 26 | 'email' => username, 27 | 'password' => password, 28 | 'Submit2' => ' Login ', 29 | 'a2' => 'login' 30 | }) 31 | 32 | res = http.request(req) 33 | 34 | unless (res and res.code.to_i == 200 and res['Set-Cookie'].to_s =~ /session=([^\s;]+)/) 35 | if res 36 | raise RuntimeError.new("#{res.code} #{res.message} #{res['Set-Cookie']} #{res.body}") 37 | else 38 | raise RuntimeError.new("No response") 39 | end 40 | end 41 | 42 | session_id = $1 43 | rescue ::Interrupt 44 | raise $! 45 | rescue ::Exception 46 | if tries < self.max_tries 47 | log("Authentication failed: #{$!.class} #{$!}, retrying...") 48 | sleep(30) 49 | retry 50 | else 51 | fail("Authentication failed: #{$!.class} #{$!} after #{tries} attempts") 52 | end 53 | end 54 | 55 | session_id 56 | end 57 | 58 | def download_file(session_id, src, dst) 59 | tmp = dst + ".tmp" 60 | size = 0 61 | ims = false 62 | 63 | http = Net::HTTP.new('www.premiumdrops.com', 443) 64 | http.use_ssl = true 65 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 66 | 67 | src = "/" + src 68 | 69 | req = Net::HTTP::Get.new(src) 70 | req['Cookie'] = 'session=' + session_id 71 | 72 | if File.exists?(dst) 73 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 74 | ims = true 75 | end 76 | 77 | http.request(req) do |res| 78 | 79 | if ims && res.code.to_i == 304 80 | log(" > Skipped downloading of #{dst} due to not modified response") 81 | return true 82 | end 83 | 84 | if ims && res['Content-Length'] 85 | if res['Content-Length'].to_i == File.size(dst) 86 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 87 | return true 88 | end 89 | end 90 | 91 | if res.code.to_i != 200 92 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 93 | return true 94 | end 95 | 96 | outp = File.open(tmp, "wb") 97 | res.read_body do |chunk| 98 | outp.write(chunk) 99 | size += chunk.length 100 | end 101 | outp.close 102 | end 103 | 104 | File.rename(tmp, dst) 105 | 106 | log(" > Downloading of #{dst} completed with #{size} bytes") 107 | end 108 | 109 | def download 110 | config['premiumdrops_urls'].each do |url| 111 | session_id = obtain_session_id(config['premiumdrops_username'], config['premiumdrops_password']) 112 | target = URI(url) 113 | params = CGI.parse(target.query) 114 | 115 | date = Time.now.strftime("%Y%m%d") 116 | path = nil 117 | zone = params['f'].first 118 | format = case params['a'].first 119 | when 'request_full_zone' 120 | 'full' 121 | when 'request_zone' 122 | 'names' 123 | when 'request_zone_changes' 124 | params['t'].first == 'diff' ? 'del' : 'add' 125 | end 126 | 127 | ext = ['full', 'names'].include?(format) ?
'.gz' : '' 128 | dir = File.expand_path(File.join(storage_path, date)) 129 | dst = File.join(dir, "#{zone}_#{format}#{ext}") 130 | 131 | FileUtils.mkdir_p(dir) 132 | 133 | http = Net::HTTP.new(target.host, target.port) 134 | http.use_ssl = true 135 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 136 | 137 | req = Net::HTTP::Get.new(target.path + '?' + target.query) 138 | req['Cookie'] = 'session=' + session_id 139 | 140 | res = http.request(req) 141 | 142 | unless res['Location'] 143 | fail("No redirect for download of #{url}") 144 | end 145 | 146 | log("Downloading #{dst}") 147 | download_file(session_id, res['Location'], dst) 148 | end 149 | end 150 | 151 | # 152 | # Normalize the latest premium drops zones 153 | # 154 | def normalize 155 | data = latest_data 156 | norm = File.expand_path(File.join(data, "..", "normalized")) 157 | FileUtils.mkdir_p(norm) 158 | 159 | date = data.split("/").last 160 | 161 | if File.exists?(File.join(norm, "#{date}-com-names.mtbl")) 162 | log("Normalized data is already present for #{data}") 163 | return true 164 | end 165 | 166 | unless inetdata_parsers_available? 167 | log("The inetdata-parsers tools are not in the execution path, aborting normalization") 168 | return false 169 | end 170 | 171 | zone_index = 0 172 | zone_files = Dir["#{data}/*_full.gz"] 173 | zone_files.each do |zone_file| 174 | 175 | zone_index += 1 176 | log("Extracting records from [#{zone_index}/#{zone_files.length}] #{zone_file}...") 177 | origin = zone_file.split('/').last.split("_").first 178 | 179 | csv_cmd = "nice " + 180 | ((origin == "sk") ? "cat" : "#{gzip_command} -dc") + " #{Shellwords.shellescape(zone_file)} | " + 181 | "nice inetdata-zone2csv | " + 182 | "nice inetdata-csvsplit -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{norm}/#{date}-#{origin}" 183 | 184 | log("Running #{csv_cmd}\n") 185 | system(csv_cmd) 186 | 187 | [ 188 | "#{norm}/#{date}-#{origin}-names.gz", 189 | "#{norm}/#{date}-#{origin}-names-inverse.gz" 190 | ].each do |f| 191 | o = f.sub(".gz", ".mtbl.tmp") 192 | mtbl_cmd = "nice #{gzip_command} -dc #{Shellwords.shellescape(f)} | nice inetdata-dns2mtbl -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{o}" 193 | log("Running #{mtbl_cmd}") 194 | system(mtbl_cmd) 195 | File.rename(o, o.gsub(/\.tmp$/, '')) 196 | end 197 | end 198 | end 199 | 200 | # 201 | # Find the most recent dataset 202 | # 203 | def latest_data 204 | path = Dir["#{storage_path}/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]"]. 205 | sort{|a,b| b.split("/")[-1].to_i <=> a.split("/")[-1].to_i}.
206 | first 207 | 208 | if not path 209 | raise RuntimeError, "No dataset available for #{self.name}" 210 | end 211 | 212 | path 213 | end 214 | 215 | end 216 | end 217 | end 218 | -------------------------------------------------------------------------------- /lib/inetdata/source/rir.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class RIR < Base 4 | 5 | def download_file(src, dst) 6 | tmp = dst + ".tmp" 7 | ims = false 8 | tries = 0 9 | 10 | begin 11 | tries += 1 12 | target = URI.parse(src) 13 | size = 0 14 | csize = nil 15 | 16 | http = Net::HTTP.new(target.host, target.port) 17 | if src.index("https") == 0 18 | http.use_ssl = true 19 | end 20 | 21 | req = Net::HTTP::Get.new(target.request_uri) 22 | 23 | if File.exists?(dst) 24 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 25 | ims = true 26 | end 27 | 28 | http.request(req) do |res| 29 | 30 | if ims && res.code.to_i == 304 31 | log(" > Skipped downloading of #{dst} due to not modified response") 32 | return true 33 | end 34 | 35 | if ims && res['Content-Length'] 36 | if res['Content-Length'].to_i == File.size(dst) 37 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 38 | return true 39 | end 40 | end 41 | 42 | if res.code.to_i != 200 43 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 44 | return true 45 | end 46 | 47 | log("Download started from #{src} to #{dst}") 48 | outp = File.open(tmp, "wb") 49 | res.read_body do |chunk| 50 | outp.write(chunk) 51 | size += chunk.length 52 | end 53 | outp.close 54 | end 55 | 56 | File.rename(tmp, dst) 57 | 58 | rescue ::Interrupt 59 | raise $! 60 | rescue ::Exception 61 | if tries < self.max_tries 62 | log("Download failed: #{src} -> #{dst} : #{$!.class} #{$!}, retrying...") 63 | sleep(30) 64 | retry 65 | else 66 | fail("Download failed: #{src} -> #{dst} : #{$!.class} #{$!} after #{tries} attempts") 67 | end 68 | end 69 | log("Download completed from #{src} to #{dst}") 70 | end 71 | 72 | def download 73 | config['rir_delegation_urls'].each do |url| 74 | targ = URI.parse(url) 75 | file = url.split("/").last 76 | date = Time.now.strftime("%Y%m%d") 77 | dir = File.expand_path(File.join(storage_path, date)) 78 | dst = File.join(dir, file) 79 | FileUtils.mkdir_p(dir) 80 | download_file(url, dst) 81 | end 82 | end 83 | 84 | # 85 | # RIR files are considered already normalized 86 | # 87 | def normalize 88 | end 89 | 90 | end 91 | end 92 | end 93 | -------------------------------------------------------------------------------- /lib/inetdata/source/sonar.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class Sonar < Base 4 | 5 | def download_file(src, dst,redirect_count=0) 6 | tmp = dst + ".tmp" 7 | target = URI.parse(src) 8 | size = 0 9 | ims = false 10 | http = Net::HTTP.new(target.host, target.port) 11 | 12 | if src.index("https") == 0 13 | http.use_ssl = true 14 | end 15 | 16 | req = Net::HTTP::Get.new(target.request_uri) 17 | 18 | if File.exists?(dst) 19 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 20 | ims = true 21 | end 22 | 23 | http.request(req) do |res| 24 | 25 | if ims && res.code.to_i == 304 26 | log(" > Skipped downloading of #{dst} due to not modified response") 27 | return true 28 | end 29 | 30 | if ims && res['Content-Length'] 31 | if res['Content-Length'].to_i == File.size(dst) 32 | log(" > Skipped downloading of #{dst} 
with same size of #{res['Content-Length']} bytes") 33 | return true 34 | end 35 | end 36 | 37 | if [301, 302].include?(res.code.to_i) 38 | 39 | if redirect_count > 3 40 | log(" > Skipped downloading of #{dst} due to redirect count being over limit: #{redirect_count}") 41 | return true 42 | end 43 | 44 | new_src = res['Location'].to_s 45 | 46 | if new_src.length == 0 47 | log(" > Skipped downloading of #{dst} due to server redirect with no location") 48 | return true 49 | end 50 | 51 | log(" > Download of #{src} moved to #{new_src}...") 52 | return download_file(new_src, dst, redirect_count + 1) 53 | end 54 | 55 | if res.code.to_i != 200 56 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message} #{res['Location']}") 57 | return true 58 | end 59 | 60 | outp = File.open(tmp, "wb") 61 | 62 | res.read_body do |chunk| 63 | outp.write(chunk) 64 | size += chunk.length 65 | end 66 | 67 | outp.close 68 | end 69 | 70 | File.rename(tmp, dst) 71 | 72 | log(" > Downloading of #{dst} completed with #{size} bytes") 73 | end 74 | 75 | def download_index(dset) 76 | target = URI.parse(config['sonar_base_url'] + dset) 77 | tries = 0 78 | begin 79 | 80 | tries += 1 81 | http = Net::HTTP.new(target.host, target.port) 82 | http.use_ssl = true 83 | 84 | req = Net::HTTP::Get.new(target.request_uri) 85 | res = http.request(req) 86 | 87 | unless (res and res.code.to_i == 200 and res.body.to_s.index('SHA1-Fingerprint')) 88 | if res 89 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 90 | else 91 | raise RuntimeError.new("No response") 92 | end 93 | end 94 | 95 | links = [] 96 | res.body.scan(/href=\"(#{dset}\d+\-\d+\-\d+\-\d+\-[^\"]+)\"/).each do |link| 97 | link = link.first 98 | if link =~ /\.json\.gz/ 99 | links << ( config['sonar_base_url'] + link ) 100 | end 101 | end 102 | 103 | return links 104 | 105 | rescue ::Interrupt 106 | raise $! 107 | rescue ::Exception 108 | if tries < self.max_tries 109 | log("Index failed: #{$!.class} #{$!}, retrying...") 110 | sleep(30) 111 | retry 112 | else 113 | fail("Index failed: #{$!.class} #{$!} after #{tries} attempts") 114 | end 115 | end 116 | end 117 | 118 | def download_fdns_index 119 | download_index('/sonar.fdns_v2/') 120 | end 121 | 122 | def download_rdns_index 123 | download_index('/sonar.rdns_v2/') 124 | end 125 | 126 | def download 127 | dir = storage_path 128 | FileUtils.mkdir_p(dir) 129 | 130 | fdns_links = download_fdns_index 131 | rdns_links = download_rdns_index 132 | 133 | queue = [] 134 | queue += rdns_links 135 | queue += fdns_links 136 | 137 | queue.each do |url| 138 | dst = File.join(dir, url.split("/").last) 139 | download_file(url, dst) 140 | end 141 | end 142 | 143 | def normalize 144 | data = storage_path 145 | norm = File.join(data, "normalized") 146 | FileUtils.mkdir_p(norm) 147 | 148 | unless inetdata_parsers_available?
149 | log("The inetdata-parsers tools are not in the execution path, aborting normalization") 150 | return false 151 | end 152 | 153 | sonar_files = sonar_datafiles 154 | sonar_files.each do |sonar_file| 155 | sonar_mtbl = File.join(norm, File.basename(sonar_file).sub(".json.gz", "-names-inverse.mtbl")) 156 | if File.exists?(sonar_mtbl) && File.size(sonar_mtbl) > 0 157 | next 158 | end 159 | 160 | output_base = File.join(norm, File.basename(sonar_file).sub(".json.gz", "")) 161 | csv_cmd = "nice #{gzip_command} -dc #{Shellwords.shellescape(sonar_file)} | nice inetdata-sonardnsv2-split -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{output_base}" 162 | log("Running #{csv_cmd}") 163 | system(csv_cmd) 164 | [ 165 | "#{output_base}-names.gz", 166 | "#{output_base}-names-inverse.gz" 167 | ].each do |f| 168 | o = f.sub(".gz", ".mtbl.tmp") 169 | mtbl_cmd = "nice #{gzip_command} -dc #{Shellwords.shellescape(f)} | inetdata-dns2mtbl -t #{get_tempdir} -m #{(get_total_ram/8.0).to_i} #{o}" 170 | log("Running #{mtbl_cmd}") 171 | system(mtbl_cmd) 172 | File.rename(o, o.gsub(/\.tmp$/, '')) 173 | end 174 | end 175 | end 176 | 177 | # 178 | # Find all sonar datafiles 179 | # 180 | def sonar_datafiles 181 | paths = Dir["#{storage_path}/*.json.gz"].sort { |a,b| 182 | File.basename(a).split(/[^\d\-]+/).first.gsub("-", '')[0,8].to_i <=> 183 | File.basename(b).split(/[^\d\-]+/).first.gsub("-", '')[0,8].to_i 184 | } 185 | end 186 | 187 | # 188 | # Find the most recent normalized dataset 189 | # 190 | def latest_normalized_data(dtype) 191 | path = Dir["#{storage_path}/normalized/*#{dtype}"].sort { |a,b| 192 | File.basename(b).split(/[^\d\-]+/).first.gsub("-", '')[0,8].to_i <=> 193 | File.basename(a).split(/[^\d\-]+/).first.gsub("-", '')[0,8].to_i 194 | }.first 195 | 196 | if not path 197 | raise RuntimeError, "No #{dtype} normalized_dataset available for #{self.name}" 198 | end 199 | 200 | path 201 | end 202 | 203 | def latest_normalized_fdns_names_mtbl 204 | latest_normalized_data("-fdns-names.mtbl") 205 | end 206 | 207 | def latest_normalized_fdns_names_inverse_mtbl 208 | latest_normalized_data("-fdns-names-inverse.mtbl") 209 | end 210 | 211 | def latest_normalized_rdns_names_mtbl 212 | latest_normalized_data("-rdns.mtbl") 213 | end 214 | 215 | def latest_normalized_rdns_names_inverse_mtbl 216 | latest_normalized_data("-rdns-inverse.mtbl") 217 | end 218 | end 219 | end 220 | end 221 | -------------------------------------------------------------------------------- /lib/inetdata/source/whoisxmlapi.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class WhoisXMLAPI < Base 4 | 5 | def available? 
6 | config['whoisxmlapi_username'].to_s.length > 0 && 7 | config['whoisxmlapi_password'].to_s.length > 0 8 | end 9 | 10 | def download_index(prefix) 11 | target = URI.parse(config['whoisxmlapi_base_url'] + prefix) 12 | 13 | tries = 0 14 | begin 15 | 16 | tries += 1 17 | http = Net::HTTP.new(target.host, target.port) 18 | http.use_ssl = true if config['whoisxmlapi_base_url'].index("https") == 0 19 | 20 | req = Net::HTTP::Get.new(target.request_uri) 21 | req.basic_auth(config['whoisxmlapi_username'], config['whoisxmlapi_password']) 22 | 23 | res = http.request(req) 24 | 25 | unless (res and res.code.to_i == 200 and res.body.to_s =~ /a href=\"(\d+_\d+_\d+_|full_)/) 26 | if res 27 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 28 | else 29 | raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}") 30 | end 31 | end 32 | 33 | res.body.to_s.scan(/a href=\"((\d+_\d+_\d+_|full_)[^\"]+)\"/m).map{|m| m.first} 34 | 35 | rescue ::Interrupt 36 | raise $! 37 | rescue ::Exception 38 | if tries < self.max_tries 39 | log("Index download failed: #{prefix} #{$!.class} #{$!}, #{$!.backtrace} retrying...") 40 | sleep(30) 41 | retry 42 | else 43 | fail("Index download failed: #{prefix} #{$!.class} #{$!} after #{tries} attempts") 44 | end 45 | end 46 | end 47 | 48 | def download_file(src, dst) 49 | target = URI.parse(src) 50 | size = 0 51 | ims = false 52 | tmp = dst + ".tmp" 53 | 54 | http = Net::HTTP.new(target.host, target.port) 55 | http.use_ssl = true if src.index("https") == 0 56 | 57 | req = Net::HTTP::Get.new(target.request_uri) 58 | req.basic_auth(config['whoisxmlapi_username'], config['whoisxmlapi_password']) 59 | 60 | if File.exists?(dst) 61 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 62 | ims = true 63 | end 64 | 65 | # Short-circuit the download if the local file exists due to the number of files 66 | if ims 67 | log(" > Skipped downloading of #{dst} due to existing file on disk") 68 | return true 69 | end 70 | 71 | http.request(req) do |res| 72 | 73 | if ims && res.code.to_i == 304 74 | log(" > Skipped downloading of #{dst} due to not modified response") 75 | return true 76 | end 77 | 78 | if ims && res['Content-Length'] 79 | if res['Content-Length'].to_i == File.size(dst) 80 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 81 | return true 82 | end 83 | end 84 | 85 | if res.code.to_i != 200 86 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 87 | return true 88 | end 89 | 90 | outp = File.open(tmp, "wb") 91 | 92 | res.read_body do |chunk| 93 | outp.write(chunk) 94 | size += chunk.length 95 | end 96 | 97 | outp.close 98 | end 99 | 100 | File.rename(tmp, dst) 101 | 102 | log(" > Downloading of #{dst} completed with #{size} bytes") 103 | end 104 | 105 | def download 106 | tries = 0 107 | 108 | config['whoisxmlapi_datasets'].each_pair do |dname,prefix| 109 | files = download_index(prefix) 110 | files.each do |fname| 111 | url = config['whoisxmlapi_base_url'] + prefix + fname 112 | 113 | target = URI.parse(url) 114 | 115 | tld = nil 116 | date = nil 117 | 118 | case fname 119 | when /^full_(\d+_\d+_\d+)_(.*)\.csv\.gz/ 120 | tld = $2 121 | date = $1.gsub(/[^[0-9]]/, '') 122 | when /^full_(.*)_(\d+[_\-]\d+[\-_]\d+).csv.gz/ 123 | tld = $1 124 | date = $2.gsub(/[^[0-9]]/, '') 125 | when /^(\d+_\d+_\d+)_(.*)\.csv\.gz/ 126 | tld = $2 127 | date = $1.gsub(/[^[0-9]]/, '') 128 | else 129 | log("Unknown file 
name format: #{fname}") 130 | next 131 | end 132 | 133 | dir = File.expand_path(File.join(storage_path, dname, date)) 134 | dst = File.join(dir, fname.gsub("/", "")) 135 | 136 | FileUtils.mkdir_p(dir) 137 | 138 | log("Downloading #{dst}") 139 | begin 140 | download_file(url, dst) 141 | rescue ::Interrupt 142 | raise $! 143 | rescue ::Exception 144 | tries += 1 145 | if tries < self.max_tries 146 | log("Download failed: #{url} #{$!.class} #{$!}, #{$!.backtrace} retrying...") 147 | sleep(30) 148 | retry 149 | else 150 | fail("Download failed: #{url} #{$!.class} #{$!} after #{tries} attempts") 151 | end 152 | end 153 | 154 | end 155 | end 156 | end 157 | 158 | end 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /lib/inetdata/source/wwwsio.rb: -------------------------------------------------------------------------------- 1 | module InetData 2 | module Source 3 | class WWWSIO < Base 4 | 5 | def available? 6 | config['wwwsio_username'].to_s.length > 0 && 7 | config['wwwsio_password'].to_s.length > 0 8 | end 9 | 10 | def download_file(src, dst) 11 | tmp = dst + ".tmp" 12 | ims = false 13 | tries = 0 14 | 15 | begin 16 | tries += 1 17 | target = URI.parse(src) 18 | size = 0 19 | csize = nil 20 | 21 | http = Net::HTTP.new(target.host, target.port) 22 | if src.index("https") == 0 23 | http.use_ssl = true 24 | end 25 | 26 | req = Net::HTTP::Get.new(target.request_uri) 27 | 28 | if File.exists?(dst) 29 | req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822 30 | ims = true 31 | end 32 | 33 | # Short-circuit the download if the local file exists due to the number of files 34 | if ims 35 | log(" > Skipped downloading of #{dst} due to existing file on disk") 36 | return true 37 | end 38 | 39 | http.request(req) do |res| 40 | 41 | if ims && res.code.to_i == 304 42 | log(" > Skipped downloading of #{dst} due to not modified response") 43 | return true 44 | end 45 | 46 | if ims && res['Content-Length'] 47 | if res['Content-Length'].to_i == File.size(dst) 48 | log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes") 49 | return true 50 | end 51 | end 52 | 53 | if res.code.to_i != 200 54 | log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message}") 55 | return true 56 | end 57 | 58 | log("Download started from #{src} to #{dst}") 59 | outp = File.open(tmp, "wb") 60 | res.read_body do |chunk| 61 | outp.write(chunk) 62 | size += chunk.length 63 | end 64 | outp.close 65 | end 66 | 67 | File.rename(tmp, dst) 68 | 69 | rescue ::Interrupt 70 | raise $!
71 | rescue ::Exception 72 | if tries < self.max_tries 73 | log("Download failed: #{src} -> #{dst} : #{$!.class} #{$!}, retrying...") 74 | sleep(30) 75 | retry 76 | else 77 | fail("Download failed: #{src} -> #{dst} : #{$!.class} #{$!} after #{tries} attempts") 78 | end 79 | end 80 | log("Download completed from #{src} to #{dst}") 81 | end 82 | 83 | def download 84 | %W{ full new deleted }.each do |list| 85 | url = config['wwwsio_base_url'] + "/#{list}/all_zones/#{(config['wwwsio_username'])}/#{(config['wwwsio_password'])}" 86 | targ = URI.parse(url) 87 | file = "all_zones_#{list}.txt" 88 | date = Time.now.strftime("%Y%m%d") 89 | dir = File.expand_path(File.join(storage_path, date)) 90 | dst = File.join(dir, file) 91 | tmp = dst + ".tmp" 92 | tmp_gz = tmp + ".gz" 93 | dst_gz = dst + ".gz" 94 | 95 | FileUtils.mkdir_p(dir) 96 | download_file(url, tmp) 97 | uniq_sort_file(tmp) 98 | system("nice pigz #{tmp}") 99 | File.rename(tmp_gz, dst_gz) 100 | end 101 | end 102 | 103 | # 104 | # Normalize the latest data file 105 | # 106 | def normalize 107 | data = latest_data 108 | norm = File.join(data, "normalized") 109 | FileUtils.mkdir_p(norm) 110 | 111 | %W{ full new deleted }.each do |list| 112 | src = File.join(data, "all_zones_#{list}.txt.gz") 113 | dst = File.join(norm, "all_zones_#{list}.txt.gz") 114 | # Symlink since the source file is already sorted and compressed 115 | FileUtils.ln_sf("../all_zones_#{list}.txt.gz", dst) 116 | end 117 | end 118 | 119 | # 120 | # Find the most recent dataset 121 | # 122 | def latest_data 123 | path = Dir["#{storage_path}/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]"]. 124 | sort{|a,b| b.split("/")[-1].to_i <=> a.split("/")[-1].to_i}. 125 | first 126 | 127 | if not path 128 | raise RuntimeError, "No dataset available for #{self.name}" 129 | end 130 | 131 | path 132 | end 133 | end 134 | end 135 | end 136 | -------------------------------------------------------------------------------- /logs/.keep: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------