├── .gitignore ├── scripts ├── uninstall.sh ├── install.sh └── _geoloc_update.sh ├── configure ├── Makefile ├── geoloc ├── hash_map.hpp ├── macros.hpp ├── error.hpp ├── args.hpp ├── error.cpp ├── connector.hpp ├── string_table.hpp ├── csv.hpp ├── blocks.hpp ├── etl.hpp ├── test.cpp ├── pipeline.hpp ├── asns.hpp ├── locations.hpp ├── geoloc.cpp ├── serialization.hpp └── query.hpp ├── LICENSE ├── README.md └── outline.md /.gitignore: -------------------------------------------------------------------------------- 1 | bin/* 2 | tmp/* 3 | -------------------------------------------------------------------------------- /scripts/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -u 5 | 6 | if [[ -e ~/bin/geoloc ]]; then 7 | rm ~/bin/geoloc 8 | fi 9 | 10 | if [[ -e ~/bin/_geoloc_update.sh ]]; then 11 | rm ~/bin/_geoloc_update.sh 12 | fi 13 | 14 | if [[ -e ~/var/db/geoloc/geodata.bin ]]; then 15 | rm ~/var/db/geoloc/geodata.bin 16 | fi 17 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -u 5 | 6 | function check_for() 7 | { 8 | printf "checking for $1 ... " 9 | 10 | if ! which "$1"; then 11 | 12 | printf "failed\n" 13 | 14 | echo "ERROR - could not find $1" 15 | exit 1 16 | fi 17 | } 18 | 19 | # this isn't a real autoconf script. it's much simpler. 
20 | 21 | check_for iconv 22 | check_for unzip 23 | check_for curl 24 | check_for make 25 | check_for c++ 26 | 27 | echo ok 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | $(shell mkdir -p bin; mkdir -p tmp) 2 | 3 | DEPS := $(shell echo Makefile geoloc/*.cpp geoloc/*.hpp) 4 | 5 | all: bin/geoloc bin/test 6 | 7 | bin/geoloc: $(DEPS) 8 | c++ -std=c++03 -O2 -Wall -Werror geoloc/geoloc.cpp geoloc/error.cpp \ 9 | -o bin/geoloc 10 | 11 | bin/test: $(DEPS) 12 | c++ -std=c++03 -g -Wall -Werror geoloc/test.cpp geoloc/error.cpp \ 13 | -o bin/test 14 | 15 | .PHONY: test install uninstall clean 16 | 17 | test: bin/test 18 | ./bin/test 19 | 20 | install: bin/geoloc 21 | ./scripts/install.sh 22 | 23 | uninstall: 24 | ./scripts/uninstall.sh 25 | 26 | clean: 27 | rm -rf bin 28 | -------------------------------------------------------------------------------- /geoloc/hash_map.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This file contains macros to paper over differences between older/newer 9 | * compilers as far as using std::unordered_map is concerned. 
10 | */ 11 | 12 | #ifndef HASH_MAP_HPP_182989E8 13 | #define HASH_MAP_HPP_182989E8 14 | 15 | #ifndef __linux 16 | #include 17 | 18 | #define hash_map std::unordered_map 19 | 20 | #else 21 | #include 22 | #define hash_map std::tr1::unordered_map 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /scripts/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -u 5 | 6 | # geoloc installer script - responsible for installing binaries and scripts 7 | # and ensuring that a database is created 8 | 9 | # this script is meant to be called from make install 10 | 11 | if [[ ! -e ~/bin ]]; then 12 | mkdir -p ~/bin 13 | fi 14 | 15 | cp bin/geoloc ~/bin 16 | cp scripts/_geoloc_update.sh ~/bin 17 | 18 | ~/bin/_geoloc_update.sh 19 | 20 | # do a self test 21 | 22 | result=$(~/bin/geoloc -q 8.8.8.8) 23 | expected="8.8.8.8 US % % 37.7510 -97.8220 AS15169 Google+LLC" 24 | 25 | if [[ "$result" != "$expected" ]]; then 26 | echo "ERROR: unexpected self-test result" 27 | exit 1 28 | fi 29 | -------------------------------------------------------------------------------- /geoloc/macros.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-13 7 | * 8 | * This file contains macros that are commonly used throughout the source. 9 | */ 10 | 11 | #ifndef MACROS_HPP_942AF38F 12 | #define MACROS_HPP_942AF38F 13 | 14 | #define UNUSED(x) (void)(x) 15 | 16 | #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ 17 | TypeName(const TypeName&); \ 18 | void operator=(const TypeName&) 19 | 20 | inline const char* get_endian() 21 | { 22 | union 23 | { 24 | unsigned u; 25 | char s[4]; 26 | } data; 27 | 28 | data.u = 0x01020304; 29 | 30 | return data.s[0] == 0x04 ? 
"little" : "big"; 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /geoloc/error.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This file contains the declarations needed to support logging and error 9 | * handling. 10 | */ 11 | 12 | #ifndef ERROR_HPP_B43A43DE 13 | #define ERROR_HPP_B43A43DE 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | void fatal_error(const char* file, unsigned line, const char* fmt, ...); 20 | void log_context(const char* file, unsigned line, const char* fmt, ...); 21 | 22 | #define REL_ASSERT(condition) { if (!(condition)) fatal_error(__FILE__, __LINE__, "assert failed (" #condition ")"); } 23 | #define FATAL_ERROR(...) { fatal_error(__FILE__, __LINE__, __VA_ARGS__); } 24 | #define LOG_CONTEXT(...) { log_context(__FILE__, __LINE__, __VA_ARGS__); } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /geoloc/args.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-13 7 | * 8 | * This file contains a command line argument helper class. 
/*
 * Args is a forward-only cursor over the command line.
 *
 * The cursor starts at index 1, so argv[0] (the program name) is never
 * returned. peek() and pop() return 0 once every argument has been consumed.
 */
struct Args
{
    Args(int argc, char** argv)
    :
        argc_(argc),
        argv_(argv),
        idx_(1)
    {
    }

    // Returns the next argument without consuming it, or 0 if none remain.
    const char* peek() const
    {
        return empty() ? 0 : argv_[idx_];
    }

    // Returns the next argument and advances the cursor, or 0 if none remain.
    const char* pop()
    {
        if (empty())
        {
            return 0;
        }

        return argv_[idx_++];
    }

    // True when no unread arguments remain.
    //
    // The comparison is >= rather than ==: the cursor starts at 1, so with
    // argc == 0 an equality test would never report empty and peek()/pop()
    // would read past the end of argv.
    bool empty() const
    {
        return argv_ == 0 || idx_ >= argc_;
    }

    int argc_;     // number of entries in argv_
    char** argv_;  // borrowed from main(); not owned
    int idx_;      // index of the next unread argument
};
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /scripts/_geoloc_update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -u 5 | 6 | # geoloc database updater script 7 | # uses wget to download relevant csv from maxmind and transforms to UTF-8 8 | # finally it builds the db with geoloc --import 9 | 10 | tok=$(dd if=/dev/urandom bs=12 count=1 2>/dev/null | xxd -p) 11 | 12 | mkdir -p /tmp/geoloc.$tok 13 | echo "staging area /tmp/geoloc.$tok" 14 | 15 | cd /tmp/geoloc.$tok 16 | 17 | echo "downloading from loadzero" 18 | echo "warning: This geoip database is old (from 2017)" 19 | echo "warning: For experimental use only" 20 | 21 | curl -L -O http://blog.loadzero.com/assets/geodata.bin.gz 22 | gunzip geodata.bin.gz 23 | mkdir -p ~/var/db/geoloc 24 | mv geodata.bin ~/var/db/geoloc 25 | rm -rf "/tmp/geoloc.$tok" 26 | 27 | # Deprecated as of Jan 2019 28 | # 29 | #wget https://geolite.maxmind.com/download/geoip/database/GeoLiteCity_CSV/GeoLiteCity-latest.zip 30 | #wget https://download.maxmind.com/download/geoip/database/asnum/GeoIPASNum2.zip 31 | # 32 | #blocks_file=$(unzip -l GeoLiteCity-latest.zip |grep City.Blocks.csv | awk '{print $4}') 33 | #dir=$(dirname $blocks_file) 34 | #frag="" 35 | # 36 | #if ! 
// Capacity of the context ring in bytes. It is also the size of the
// formatting scratch buffer, so a single message (including its trailing
// newline) can never be larger than the ring.
static const size_t LOG_RING_SIZE = 4096;

// Mirrored ring buffer: byte i of the ring is stored at both error_buf_[i]
// and error_buf_[i + LOG_RING_SIZE]. Because of the mirror, the most recent
// avail_ bytes are always readable as one contiguous span.
static char error_buf_[2 * LOG_RING_SIZE] = {0};
static size_t error_offset_ = 0;   // next write position, in [0, LOG_RING_SIZE)
static size_t avail_ = 0;          // valid bytes in the ring, <= LOG_RING_SIZE
static char print_buf_[LOG_RING_SIZE] = {0};

// note - log messages larger than 4095 bytes will get truncated to 4095.

/*
 * Formats a message and appends it, newline terminated, to the context ring.
 * Nothing is printed here; the ring is dumped by fatal_error().
 *
 * file and line are accepted for symmetry with fatal_error() but are not
 * recorded.
 */
void log_context(const char* file, unsigned line, const char* fmt, ...)
{
    (void) file;
    (void) line;

    va_list ap;

    va_start(ap, fmt);
    const int n = vsnprintf(print_buf_, sizeof(print_buf_), fmt, ap);
    va_end(ap);

    // vsnprintf reports an output error with a negative value; there is
    // nothing sensible to record in that case.
    if (n < 0)
    {
        return;
    }

    size_t len = (size_t) n;

    if (len > LOG_RING_SIZE - 1)
    {
        len = LOG_RING_SIZE - 1;
    }

    // replace the nul terminator with a new line
    print_buf_[len] = '\n';
    ++len;

    // Write each byte at its wrapped position and at its mirror. Wrapping
    // per byte keeps both copies inside error_buf_ even when the message
    // straddles the end of the ring.
    for (size_t i = 0; i < len; ++i)
    {
        const size_t pos = (error_offset_ + i) % LOG_RING_SIZE;

        error_buf_[pos] = print_buf_[i];
        error_buf_[pos + LOG_RING_SIZE] = print_buf_[i];
    }

    error_offset_ = (error_offset_ + len) % LOG_RING_SIZE;

    avail_ += len;

    if (avail_ > LOG_RING_SIZE)
    {
        avail_ = LOG_RING_SIZE;
    }
}

// Writes the buffered context, oldest byte first, to stderr.
static void log_dump()
{
    // The newest byte sits just before error_offset_ + LOG_RING_SIZE in the
    // mirrored buffer, so the valid window is the avail_ bytes ending there.
    const char* iter = error_buf_ + error_offset_ + (LOG_RING_SIZE - avail_);

    // The window is not nul terminated once the ring is full, so write an
    // explicit byte count instead of printing it as a C string.
    fwrite(iter, 1, avail_, stderr);
}

/*
 * Prints "file:line: error: <message>" to stderr, followed by any buffered
 * context, then terminates the process with exit status 1.
 */
void fatal_error(const char* file, unsigned line, const char* fmt, ...)
{
    va_list ap;
    fprintf(stderr, "%s:%u: error: ", file, line);

    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);

    fprintf(stderr, "\n");

    if (avail_)
    {
        fprintf(stderr, "context:\n");
        log_dump();
    }

    exit(1);
}
17 | */ 18 | 19 | #ifndef CONNECTOR_HPP_576AAD9D 20 | #define CONNECTOR_HPP_576AAD9D 21 | 22 | #include "macros.hpp" 23 | 24 | class Buffer 25 | { 26 | public: 27 | Buffer() 28 | : 29 | data_(0), 30 | n_(0) 31 | { 32 | } 33 | 34 | Buffer(const void* p, size_t n) 35 | { 36 | data_ = p; 37 | n_ = n; 38 | } 39 | 40 | ~Buffer() 41 | { 42 | } 43 | 44 | const void* data() const 45 | { 46 | return data_; 47 | } 48 | 49 | size_t size() const 50 | { 51 | return n_; 52 | } 53 | 54 | // default copy/assign is fine. 55 | 56 | private: 57 | const void *data_; 58 | size_t n_; 59 | }; 60 | 61 | class Connector 62 | { 63 | public: 64 | Connector() 65 | : 66 | downstream_(0) 67 | { 68 | } 69 | 70 | virtual void emit(const Buffer &b) 71 | { 72 | if (!downstream_) 73 | { 74 | return; 75 | } 76 | 77 | downstream_->consume(b); 78 | } 79 | 80 | virtual void emit_flush() 81 | { 82 | if (!downstream_) 83 | { 84 | return; 85 | } 86 | 87 | downstream_->flush(); 88 | } 89 | 90 | virtual void consume(const Buffer &b) = 0; 91 | 92 | virtual void flush() 93 | { 94 | emit_flush(); 95 | } 96 | 97 | virtual Connector& operator|(Connector &c) 98 | { 99 | downstream_ = &c; 100 | return *downstream_; 101 | } 102 | 103 | private: 104 | DISALLOW_COPY_AND_ASSIGN(Connector); 105 | 106 | Connector* downstream_; 107 | }; 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /geoloc/string_table.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This class is used for interning strings. It uses a hash map to track the 9 | * string to id mapping. The layout of indices and the char vector makes it 10 | * easier to serialize later. 
11 | */ 12 | 13 | #ifndef STRING_TABLE_HPP_A3ADA5DC 14 | #define STRING_TABLE_HPP_A3ADA5DC 15 | 16 | #include "macros.hpp" 17 | #include "hash_map.hpp" 18 | 19 | class StringTable 20 | { 21 | public: 22 | StringTable() {} 23 | ~StringTable() {} 24 | 25 | size_t size() const 26 | { 27 | return indices_.size(); 28 | } 29 | 30 | size_t byte_size() const 31 | { 32 | return strings_.size(); 33 | } 34 | 35 | void insert(const std::string &s) 36 | { 37 | if (string_to_id_.count(s)) 38 | { 39 | return; 40 | } 41 | 42 | unsigned index = indices_.size(); 43 | string_to_id_[s] = index; 44 | 45 | indices_.push_back(strings_.size()); 46 | 47 | strings_.insert(strings_.end(), s.begin(), s.end()); 48 | strings_.push_back('\0'); 49 | } 50 | 51 | unsigned index_of(const std::string &s) const 52 | { 53 | hash_map::const_iterator iter = 54 | string_to_id_.find(s); 55 | 56 | if (iter == string_to_id_.end()) 57 | { 58 | return 0xFFFFFFFF; 59 | } 60 | 61 | return iter->second; 62 | } 63 | 64 | const char* operator[](size_t i) const 65 | { 66 | return &strings_[indices_[i]]; 67 | } 68 | 69 | const std::vector &indices() const { return indices_; } 70 | const std::vector &strings() const { return strings_; } 71 | 72 | private: 73 | 74 | // default copy/assign would work in this class, but I am making 75 | // it an error because it is unexpected. 
// Parses s as a base 10 unsigned integer. Parsing stops at the first
// character that is not a digit; text with no leading number yields 0.
inline unsigned to_u(const char* s)
{
    return strtoul(s, 0, 10);
}

/*
 * Splits one line of CSV into fields.
 *
 * The first n bytes of s are copied into scratch, the copy is cut up in
 * place by overwriting delimiters with '\0', and a pointer to the start of
 * each field is appended to toks. The pointers refer to scratch, so they are
 * only valid until scratch is next modified.
 *
 * A field may be wrapped in double quotes, in which case commas inside the
 * quotes are kept and the quotes themselves are dropped. Doubled ("")
 * quotes are not treated as an escape. A trailing comma yields a final
 * empty field. Empty input yields no fields.
 */
inline void csv_split(const char* s,
                      size_t n,
                      std::string &scratch,
                      std::vector<char*> &toks)
{
    toks.clear();
    scratch.assign(s, n);

    // Taking &scratch[0] of an empty string is undefined before C++11, and
    // there is nothing to split anyway.
    if (scratch.empty())
    {
        return;
    }

    // csv splitter needs to handle quote marks.

    char* ptr = &scratch[0];
    bool first = true;

    while (*ptr)
    {
        if (first)
        {
            first = false;
        }
        else
        {
            // ptr rests on the delimiter that ended the previous field
            *ptr = '\0';
            ++ptr;
        }

        if (*ptr != '"')
        {
            char* begin = ptr;
            while (*ptr && *ptr != ',') ++ptr;
            toks.push_back(begin);
        }
        else
        {
            // skip the opening quote
            ++ptr;
            char* begin = ptr;

            while (*ptr && *ptr != '"') ++ptr;

            // terminate the field at the closing quote, if there is one
            if (*ptr == '"')
            {
                *ptr = '\0';
                ++ptr;
            }

            toks.push_back(begin);
        }
    }
}

/*
 * Splits s on every occurrence of delim.
 *
 * Works like csv_split without the quote handling: s is copied into scratch,
 * delimiters in the copy are overwritten with '\0', and toks receives a
 * pointer to the start of each piece. The pointers are only valid until
 * scratch is next modified. Empty input yields no pieces.
 */
inline void char_split(const std::string &s,
                       std::string &scratch,
                       std::vector<char*> &toks,
                       char delim)
{
    toks.clear();
    scratch.assign(s.begin(), s.end());

    // see csv_split: &scratch[0] must not be taken on an empty string
    if (scratch.empty())
    {
        return;
    }

    char* ptr = &scratch[0];
    bool first = true;

    while (*ptr)
    {
        if (first)
        {
            first = false;
        }
        else
        {
            // ptr rests on the delimiter that ended the previous piece
            *ptr = '\0';
            ++ptr;
        }

        char* begin = ptr;
        while (*ptr && *ptr != delim) ++ptr;

        toks.push_back(begin);
    }
}
12 | */ 13 | 14 | #ifndef BLOCKS_HPP_57764690 15 | #define BLOCKS_HPP_57764690 16 | 17 | #include "error.hpp" 18 | #include "serialization.hpp" 19 | #include "connector.hpp" 20 | #include "csv.hpp" 21 | 22 | struct Block 23 | { 24 | unsigned start_ip; 25 | unsigned end_ip; 26 | unsigned loc; 27 | }; 28 | 29 | class BlockTable 30 | { 31 | public: 32 | void load(MemoryFile& file) 33 | { 34 | LOG_CONTEXT("BlockTable load"); 35 | 36 | file.load_mapped_vector(start_ip); 37 | file.load_mapped_vector(end_ip); 38 | file.load_mapped_vector(loc); 39 | } 40 | 41 | MappedVector start_ip; 42 | MappedVector end_ip; 43 | MappedVector loc; 44 | 45 | // default copy/assign is fine 46 | }; 47 | 48 | class BlockParser : public Connector 49 | { 50 | public: 51 | BlockParser() 52 | : 53 | scratch_(), 54 | toks_(), 55 | line_(0) 56 | { 57 | } 58 | 59 | void consume(const Buffer &b) 60 | { 61 | line_++; 62 | if (line_ < 3) return; 63 | 64 | csv_split((const char*) b.data(), b.size(), scratch_, toks_); 65 | 66 | if (toks_.size() != 3) 67 | { 68 | // we just silently drop bad lines 69 | return; 70 | } 71 | 72 | Block block; 73 | 74 | block.start_ip = to_u(toks_[0]); 75 | block.end_ip = to_u(toks_[1]); 76 | block.loc = to_u(toks_[2]); 77 | 78 | emit(Buffer(&block, sizeof(block))); 79 | } 80 | 81 | std::string scratch_; 82 | std::vector toks_; 83 | 84 | size_t line_; 85 | }; 86 | 87 | inline bool save_blocks(BinaryFile &file, const std::vector &v) 88 | { 89 | std::vector start_ip; 90 | std::vector end_ip; 91 | std::vector loc; 92 | 93 | start_ip.resize(v.size()); 94 | end_ip.resize(v.size()); 95 | loc.resize(v.size()); 96 | 97 | unsigned last = 0; 98 | 99 | // split into columns and check sortedness 100 | 101 | for (size_t i = 0; i < v.size(); ++i) 102 | { 103 | assert(v[i].start_ip > last); 104 | assert(v[i].end_ip >= v[i].start_ip); 105 | 106 | start_ip[i] = v[i].start_ip; 107 | end_ip[i] = v[i].end_ip; 108 | loc[i] = v[i].loc; 109 | 110 | last = v[i].end_ip; 111 | } 112 | 113 | 
file.save_pod_vector(start_ip); 114 | file.save_pod_vector(end_ip); 115 | file.save_pod_vector(loc); 116 | 117 | return true; 118 | } 119 | 120 | #endif 121 | -------------------------------------------------------------------------------- /geoloc/etl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This module contains helper functions to extract, transform and load a 9 | * MaxMind csv dataset. 10 | */ 11 | 12 | #ifndef ETL_HPP_89C643E4 13 | #define ETL_HPP_89C643E4 14 | 15 | #include "serialization.hpp" 16 | #include "pipeline.hpp" 17 | #include "locations.hpp" 18 | #include "blocks.hpp" 19 | #include "asns.hpp" 20 | 21 | inline void build_locations(BinaryFile &file, const char* source) 22 | { 23 | LOG_CONTEXT("build_locations from %s", source); 24 | 25 | FileReader reader(source); 26 | LocationParser parser; 27 | 28 | std::vector locations; 29 | Collector collector(locations); 30 | 31 | reader | parser | collector; 32 | reader.produce(); 33 | 34 | save_locations(file, locations); 35 | } 36 | 37 | inline void build_blocks(BinaryFile &file, const char* source) 38 | { 39 | LOG_CONTEXT("build_blocks from %s", source); 40 | 41 | FileReader reader(source); 42 | BlockParser parser; 43 | 44 | std::vector blocks; 45 | Collector collector(blocks); 46 | 47 | reader | parser | collector; 48 | reader.produce(); 49 | 50 | save_blocks(file, blocks); 51 | } 52 | 53 | inline void build_asns(BinaryFile &file, const char* source) 54 | { 55 | LOG_CONTEXT("build_asns from %s", source); 56 | 57 | FileReader reader(source); 58 | ASNParser parser; 59 | 60 | std::vector asns; 61 | Collector collector(asns); 62 | 63 | reader | parser | collector; 64 | reader.produce(); 65 | 66 | save_asns(file, asns); 67 | } 68 | 69 | inline void build_geo_data(BinaryFile &file, 70 | const char* 
city_blocks, 71 | const char* city_locs, 72 | const char* geo_asns) 73 | { 74 | build_blocks(file, city_blocks); 75 | build_locations(file, city_locs); 76 | build_asns(file, geo_asns); 77 | } 78 | 79 | inline void get_header(char* buf, size_t n) 80 | { 81 | REL_ASSERT(n > 0); 82 | 83 | memset(buf, '-', n); 84 | 85 | int np = snprintf(buf, n, "geoloc loadzero v001 %s ", get_endian()); 86 | 87 | REL_ASSERT(np >= 0); 88 | 89 | if (np > (int) (n-1)) 90 | { 91 | np = n-1; 92 | } 93 | 94 | buf[np] = '-'; 95 | buf[n-1] = '\n'; 96 | } 97 | 98 | inline void etl(const char* city_blocks, 99 | const char* city_locs, 100 | const char* geo_asns, 101 | const char* output) 102 | { 103 | LOG_CONTEXT("etl blocks %s locs %s asns %s into file %s", 104 | city_blocks, 105 | city_locs, 106 | geo_asns, 107 | output); 108 | 109 | BinaryFile file; 110 | bool ok = file.open(output); 111 | 112 | if (!ok) 113 | { 114 | FATAL_ERROR("could not open %s for writing", output); 115 | } 116 | 117 | char buf[32]; get_header(buf, sizeof(buf)); 118 | 119 | file.save_bytes_raw(buf, sizeof(buf)); 120 | build_geo_data(file, city_blocks, city_locs, geo_asns); 121 | } 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Warning: This project is unmaintained (due to MaxMind API deprecation) and currently uses an old database from 2017 3 | 4 | geoloc is a command line tool for bulk geolocation queries written in C++. 5 | Once its binary database has been built, ```geoloc``` performs geolocation 6 | queries offline. 
7 | 8 | Examples: 9 | 10 | Bulk lookup from apache `access.log`: 11 | 12 | ``` 13 | $ cat access.log | awk '{print $1}' | geoloc -f - | column -t 14 | 15 | 10.172.47.117 AU 02 Sydney -33.8001 151.3123 AS1610581 BIGCableCo 16 | 10.36.87.70 AU 07 Melbourne -37.8266 144.7834 AS1370775 Micronode+PTY+LTD 17 | 10.88.81.165 US CA San+Francisco 37.6777 -122.2221 AS49335653 Big+Flare,+Inc 18 | ``` 19 | 20 | Query some IPs: 21 | 22 | ``` 23 | $ geoloc -q 8.8.8.8 192.30.252.131 --headers | column -t 24 | 25 | ip country region city latitude longitude as_num as_text 26 | 8.8.8.8 US CA Mountain+View 37.3860 -122.0838 AS15169 Google+Inc. 27 | 192.30.252.131 US CA San+Francisco 37.7697 -122.3933 AS36459 GitHub,+Inc. 28 | ``` 29 | 30 | ```geoloc``` is designed to run fast and load fast: 31 | 32 | ``` 33 | $ wc -l /tmp/ip_list 34 | 35 | 1000000 /tmp/ip_list 36 | 37 | $ time geoloc -f /tmp/ip_list > /tmp/res1 38 | 39 | real 0m6.131s 40 | user 0m5.662s 41 | sys 0m0.369s 42 | 43 | $ time geoloc -q 8.8.8.8 192.30.252.131 > /tmp/res2 44 | 45 | real 0m0.010s 46 | user 0m0.002s 47 | sys 0m0.005s 48 | 49 | ``` 50 | 51 | Installation 52 | ============ 53 | 54 | The program is designed as a portable application, to run out of ```~/bin```, 55 | with the database stored in ```~/var/db/geoloc/geodata.bin```. 56 | 57 | To install: 58 | 59 | ``` 60 | $ git clone https://github.com/loadzero/geoloc.git && cd geoloc 61 | $ ./configure 62 | $ make 63 | $ make install 64 | ``` 65 | 66 | The configure script will check for these dependencies: 67 | 68 | - iconv 69 | - unzip 70 | - curl 71 | - make 72 | - c++ 73 | 74 | During installation, data will be downloaded from 75 | [MaxMind](http://dev.maxmind.com/geoip/legacy/geolite/) to create the database. 76 | 77 | An update script will be installed into ```~/bin/_geoloc_update.sh```. Run 78 | this script when you would like to update your geolocation database. MaxMind 79 | updates their source data once a month. 
80 | 81 | I have tested on OSX 10.9.5 and Ubuntu 14.04. Other unices are likely to work 82 | with minimal or no changes. It is unlikely to work on windows, due to the use 83 | of [mmap](http://en.wikipedia.org/wiki/Mmap). 84 | 85 | Design and Implementation 86 | ========================= 87 | 88 | The code operates in two phases, packing and query. The packing phase is all 89 | about converting the data into a machine optimal format, namely relocatable 90 | sorted vectors. The query phase simply mmaps that data, and performs a 91 | std::upper\_bound binary search on it to find the IPs. 92 | 93 | There is an outline of the code, roughly in topological order 94 | [here](outline.md), that contains a summary of each module. 95 | 96 | Attribution 97 | =========== 98 | 99 | This software includes GeoLite data created by MaxMind available from 100 | [http://www.maxmind.com](http://www.maxmind.com) 101 | -------------------------------------------------------------------------------- /geoloc/test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-07 7 | * 8 | * This file contains test code for geoloc. It is mostly serialization tests. 
9 | */ 10 | 11 | #include "serialization.hpp" 12 | #include "string_table.hpp" 13 | 14 | #include 15 | #include 16 | 17 | struct Poddable 18 | { 19 | int a; 20 | int b; 21 | int c; 22 | 23 | bool operator==(const Poddable &o) const 24 | { 25 | return memcmp(this, &o, sizeof(o)) == 0; 26 | } 27 | }; 28 | 29 | static int test_poddable_roundtrip(); 30 | static int test_string_table_roundtrip(); 31 | 32 | int main(int argc, char** argv) 33 | { 34 | // serialization tests 35 | 36 | test_poddable_roundtrip(); 37 | test_string_table_roundtrip(); 38 | } 39 | 40 | static int test_poddable_roundtrip() 41 | { 42 | std::vector bar_vec; 43 | 44 | { 45 | for (size_t i = 0; i < 100; ++i) 46 | { 47 | Poddable bar; 48 | bar.a = 100 + i; 49 | bar.b = 1000 + i; 50 | bar.c = 10000 + i; 51 | 52 | bar_vec.push_back(bar); 53 | } 54 | 55 | BinaryFile bf; 56 | 57 | bf.open("tmp/foo.bin"); 58 | bf.save_pod_vector(bar_vec); 59 | } 60 | 61 | MemoryFile mf; 62 | mf.open("tmp/foo.bin"); 63 | 64 | MappedVector foo; 65 | mf.load_mapped_vector(foo); 66 | 67 | assert(foo.size() != 0); 68 | assert(foo.size() == bar_vec.size()); 69 | 70 | for (size_t i = 0; i < bar_vec.size(); ++i) 71 | { 72 | assert(foo[i] == bar_vec[i]); 73 | } 74 | 75 | return 0; 76 | } 77 | 78 | static int test_string_table_roundtrip() 79 | { 80 | { 81 | StringTable st; 82 | 83 | st.insert("aaaa"); 84 | st.insert("aaaa"); 85 | st.insert("aaaa"); 86 | 87 | assert(st.size() == 1); 88 | assert(st.byte_size() == 5); 89 | assert(st.index_of("aaaa") == 0); 90 | 91 | assert(strcmp(st[0], "aaaa") == 0); 92 | 93 | st.insert("bbb"); 94 | 95 | assert(st.size() == 2); 96 | assert(st.byte_size() == 9); 97 | 98 | assert(st.index_of("bbb") == 1); 99 | assert(strcmp(st[1], "bbb") == 0); 100 | 101 | BinaryFile bf; 102 | bf.open("tmp/sv.bin"); 103 | 104 | for (size_t i = 0; i < 1000; ++i) 105 | { 106 | char buf3[1024]; 107 | 108 | sprintf(buf3, "funky str %d %d %d\n", (int) i, (int) i * 1234, 109 | (int) i * 123456); 110 | st.insert(buf3); 111 | } 
112 | 113 | save_string_table(bf, st); 114 | } 115 | 116 | MemoryFile mf; 117 | mf.open("tmp/sv.bin"); 118 | 119 | MappedStringVector mv; 120 | mf.load_mapped_string_vector(mv); 121 | 122 | assert(mv.size() == 1002); 123 | 124 | assert(strcmp(mv[0], "aaaa") == 0); 125 | assert(strcmp(mv[1], "bbb") == 0); 126 | 127 | for (size_t i = 0; i < 1000; ++i) 128 | { 129 | char buf3[1024]; 130 | sprintf(buf3, "funky str %d %d %d\n", (int) i, (int) i * 1234, 131 | (int) i * 123456); 132 | assert(strcmp(mv[i+2], buf3) == 0); 133 | } 134 | 135 | return 0; 136 | } 137 | -------------------------------------------------------------------------------- /geoloc/pipeline.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This module contains some pipeline framework utility classes. They are used 9 | * to input data into the pipelines. Analogous to cat or echo. 
10 | */ 11 | 12 | #ifndef PIPELINE_HPP_0D24961E 13 | #define PIPELINE_HPP_0D24961E 14 | 15 | #include 16 | #include 17 | 18 | #include "connector.hpp" 19 | 20 | class FileReader : public Connector 21 | { 22 | public: 23 | explicit FileReader(const std::string &fn) 24 | : 25 | buffer_(0), 26 | cap_(0), 27 | file_(0) 28 | { 29 | if (fn == "-") 30 | { 31 | file_ = stdin; 32 | } 33 | else 34 | { 35 | file_ = fopen(fn.c_str(), "r"); 36 | } 37 | 38 | if (!file_) 39 | { 40 | FATAL_ERROR("could not open %s", fn.c_str()); 41 | } 42 | 43 | if (file_ == 0) 44 | { 45 | fprintf(stderr, "file %s dne\n", fn.c_str()); 46 | exit(1); 47 | } 48 | } 49 | 50 | virtual ~FileReader() 51 | { 52 | if (file_ && file_ != stdin) 53 | { 54 | fclose(file_); 55 | } 56 | 57 | if (buffer_) 58 | { 59 | free(buffer_); 60 | } 61 | } 62 | 63 | void consume(const Buffer &b) {} 64 | 65 | void produce() 66 | { 67 | while (produce_one()); 68 | flush(); 69 | } 70 | 71 | bool produce_one() 72 | { 73 | ssize_t n = getline(&buffer_, &cap_, file_); 74 | 75 | if (n <= 0) 76 | { 77 | return false; 78 | } 79 | 80 | if (buffer_[n-1] == '\n') 81 | { 82 | buffer_[n-1] = '\0'; 83 | n--; 84 | } 85 | 86 | emit(Buffer(buffer_, n)); 87 | 88 | return true; 89 | } 90 | 91 | private: 92 | char* buffer_; 93 | size_t cap_; 94 | FILE* file_; 95 | }; 96 | 97 | template 98 | class Collector : public Connector 99 | { 100 | public: 101 | explicit Collector(std::vector &out) 102 | : 103 | out_(out) 104 | { 105 | } 106 | 107 | void consume(const Buffer &b) 108 | { 109 | const T* item = (const T*)(b.data()); 110 | out_.push_back(*item); 111 | } 112 | 113 | private: 114 | std::vector &out_; 115 | }; 116 | 117 | class StringInjector : public Connector 118 | { 119 | public: 120 | explicit StringInjector(const std::vector &strings) 121 | : 122 | strings_(strings), 123 | index_(0) 124 | { 125 | } 126 | 127 | void consume(const Buffer &b) {} 128 | 129 | void produce() 130 | { 131 | while (produce_one()); 132 | flush(); 133 | } 134 | 135 | 
bool produce_one() 136 | { 137 | if (index_ == strings_.size()) 138 | { 139 | return false; 140 | } 141 | 142 | const std::string& str = strings_[index_]; 143 | 144 | emit(Buffer(str.c_str(), str.size())); 145 | index_++; 146 | 147 | return true; 148 | } 149 | 150 | private: 151 | 152 | const std::vector &strings_; 153 | size_t index_; 154 | }; 155 | 156 | #endif 157 | -------------------------------------------------------------------------------- /geoloc/asns.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This module handles representations of ASN data, both normal and packed 9 | * formats. 10 | * 11 | * An ASN describes an ip range, autonomous system number, and some text 12 | * describing the system. 13 | */ 14 | 15 | #ifndef ASNS_HPP_5623F7C7 16 | #define ASNS_HPP_5623F7C7 17 | 18 | #include "serialization.hpp" 19 | #include "csv.hpp" 20 | #include "blocks.hpp" 21 | #include "hash_map.hpp" 22 | 23 | struct ASN 24 | { 25 | unsigned start_ip; 26 | unsigned end_ip; 27 | unsigned number; 28 | 29 | std::string text; 30 | }; 31 | 32 | struct PackedASN 33 | { 34 | unsigned number; 35 | unsigned text; 36 | }; 37 | 38 | class ASNTable 39 | { 40 | public: 41 | void load(MemoryFile& file) 42 | { 43 | file.load_mapped_string_vector(text); 44 | file.load_mapped_vector(asns); 45 | } 46 | 47 | MappedStringVector text; 48 | MappedVector asns; 49 | }; 50 | 51 | class ASNParser : public Connector 52 | { 53 | public: 54 | ASNParser() 55 | : 56 | line_(0) 57 | { 58 | } 59 | 60 | void parse_text(char* str, unsigned &num, std::string &txt) 61 | { 62 | scratch_ = str; 63 | 64 | char* iter = &scratch_[0]; 65 | char* end = iter + scratch_.size(); 66 | 67 | char* sp = strchr(iter, ' '); 68 | 69 | const char* tok0 = iter; 70 | const char* tok1 = sp ? 
sp + 1 : end; 71 | 72 | REL_ASSERT(strlen(tok0) > 2); 73 | 74 | num = to_u(tok0 + 2); 75 | txt = tok1; 76 | } 77 | 78 | void consume(const Buffer &b) 79 | { 80 | ++line_; 81 | 82 | csv_split((const char*) b.data(), b.size(), scratch_, toks); 83 | 84 | if (toks.size() != 3) 85 | { 86 | return; 87 | } 88 | 89 | ASN asn; 90 | 91 | asn.start_ip = to_u(toks[0]); 92 | asn.end_ip = to_u(toks[1]); 93 | parse_text(toks[2], asn.number, asn.text); 94 | 95 | emit(Buffer(&asn, sizeof(asn))); 96 | } 97 | 98 | private: 99 | 100 | std::string str_; 101 | std::string scratch_; 102 | std::vector toks; 103 | 104 | size_t line_; 105 | }; 106 | 107 | // split the asn data into two tables 108 | // a block table and an info table 109 | inline void save_asns(BinaryFile &file, const std::vector &asns) 110 | { 111 | hash_map asn_to_idx; 112 | std::vector packed_asns; 113 | 114 | StringTable text; 115 | 116 | for (size_t i = 0; i < asns.size(); ++i) 117 | { 118 | const ASN &asn = asns[i]; 119 | 120 | if (asn_to_idx.count(asn.number)) 121 | { 122 | continue; 123 | } 124 | 125 | asn_to_idx[asn.number] = packed_asns.size(); 126 | text.insert(asn.text); 127 | 128 | PackedASN pasn; 129 | 130 | pasn.number = asn.number; 131 | pasn.text = text.index_of(asn.text); 132 | 133 | asn_to_idx[asn.number] = packed_asns.size(); 134 | packed_asns.push_back(pasn); 135 | } 136 | 137 | std::vector asn_blocks(asns.size()); 138 | 139 | unsigned last = 0; 140 | 141 | for (size_t i = 0; i < asns.size(); ++i) 142 | { 143 | Block &block = asn_blocks[i]; 144 | const ASN &asn = asns[i]; 145 | 146 | REL_ASSERT(asn.start_ip > last); 147 | REL_ASSERT(asn.end_ip >= asn.start_ip); 148 | 149 | block.start_ip = asn.start_ip; 150 | block.end_ip = asn.end_ip; 151 | block.loc = asn_to_idx[asn.number]; 152 | 153 | last = asn.end_ip; 154 | } 155 | 156 | save_blocks(file, asn_blocks); 157 | save_string_table(file, text); 158 | file.save_pod_vector(packed_asns); 159 | } 160 | 161 | #endif 162 | 
-------------------------------------------------------------------------------- /outline.md: -------------------------------------------------------------------------------- 1 | geoloc/args.hpp 2 | -------------------------- 3 | 4 | This file contains a command line argument helper class. 5 | 6 | geoloc/error.hpp 7 | -------------------------- 8 | 9 | This file contains the declarations needed to support logging and error 10 | handling. 11 | 12 | geoloc/error.cpp 13 | -------------------------- 14 | 15 | This file contains the logging and error handling implementation. We use a 16 | mirrored ring buffer to handle log messages. The ring buffer is dumped out to 17 | stderr when an assert or fatal error fires. 18 | 19 | Note - log messages larger than 4095 bytes will get truncated to 4095 bytes. 20 | 21 | geoloc/csv.hpp 22 | -------------------------- 23 | 24 | This file contains utility functions for tokenizing and parsing strings. 25 | 26 | geoloc/macros.hpp 27 | -------------------------- 28 | 29 | This file contains macros that are commonly used throughout the source. 30 | 31 | geoloc/hash\_map.hpp 32 | -------------------------- 33 | 34 | This file contains macros to paper over differences between older/newer 35 | compilers as far as using std::unordered_map is concerned. 36 | 37 | geoloc/connector.hpp 38 | -------------------------- 39 | 40 | This module contains the pipeline framework core classes. 41 | 42 | The basic idea is similar to a unix pipeline, allowing the user to connect 43 | Connectors like so: 44 | 45 | a | b | c 46 | 47 | A Connector is analogous to a unix filter, and a Buffer is analogous to a line 48 | of text. 49 | 50 | geoloc/string\_table.hpp 51 | -------------------------- 52 | 53 | This class is used for interning strings. It uses a hash map to track the 54 | string to id mapping. 55 | The layout of indices and the char vector makes it easier to serialize later. 
 56 | 57 | geoloc/serialization.hpp 58 | -------------------------- 59 | 60 | This module contains classes for saving data into binary files, and loading it 61 | back in from memory maps. 62 | 63 | geoloc/pipeline.hpp 64 | -------------------------- 65 | 66 | This module contains some pipeline framework utility classes. They are used to 67 | input data into the pipelines. Analogous to cat or echo. 68 | 69 | geoloc/locations.hpp 70 | -------------------------- 71 | 72 | This module handles representations of location data, both normal and packed 73 | formats. 74 | 75 | A Location stores an id and positional information from the MaxMind dataset, 76 | namely: 77 | 78 | country, region, city, latitude, longitude 79 | 80 | geoloc/asns.hpp 81 | -------------------------- 82 | 83 | This module handles representations of ASN data, both normal and packed 84 | formats. 85 | 86 | An ASN describes an ip range, [autonomous system 87 | number](http://en.wikipedia.org/wiki/Autonomous_system_%28Internet%29), and 88 | some text describing the system. 89 | 90 | geoloc/blocks.hpp 91 | -------------------------- 92 | 93 | This module handles representations of IP ranges, both normal and packed 94 | formats. 95 | 96 | A Block is an ip range, and an index into another structure. 97 | 98 | geoloc/etl.hpp 99 | -------------------------- 100 | 101 | This module contains helper functions to extract, transform and load a MaxMind 102 | csv dataset. 103 | 104 | geoloc/query.hpp 105 | -------------------------- 106 | 107 | This module handles the query phase of geoloc. It is responsible for loading 108 | the geodata file, and providing interfaces for querying it. 109 | 110 | The main part of the query code uses a binary search (std::upper\_bound) 111 | against a set of memory mapped sorted vectors. 112 | 113 | geoloc/geoloc.cpp 114 | -------------------------- 115 | 116 | ```geoloc``` is a command line application for retrieving MaxMind geolocation 117 | and ASN info for a set of IP addresses. 
118 | 119 | The default output format is one record per line, space separated, with no 120 | headers. The columns output are as follows: 121 | 122 | ip, country, region, city, latitude, longitude, as_num, as\_text 123 | 124 | geoloc/test.cpp 125 | -------------------------- 126 | 127 | This file contains test code for geoloc. It is mostly serialization tests. 128 | 129 | -------------------------------------------------------------------------------- /geoloc/locations.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This module handles representations of location data, both normal and packed 9 | * formats. 10 | * 11 | * A Location stores an id and positional information from the MaxMind dataset, 12 | * namely: 13 | * 14 | * country, region, city, latitude, longitude 15 | */ 16 | 17 | #ifndef LOCATIONS_HPP_CCAC801C 18 | #define LOCATIONS_HPP_CCAC801C 19 | 20 | #include "string_table.hpp" 21 | #include "error.hpp" 22 | #include "csv.hpp" 23 | 24 | struct Location 25 | { 26 | Location() 27 | : 28 | id(0) 29 | { 30 | } 31 | 32 | unsigned id; 33 | std::string country; 34 | std::string region; 35 | std::string city; 36 | std::string lat; 37 | std::string lon; 38 | }; 39 | 40 | struct PackedLocation 41 | { 42 | PackedLocation() 43 | : 44 | id(0), 45 | country(0), 46 | region(0), 47 | city(0), 48 | lat(0.0), 49 | lon(0.0) 50 | { 51 | } 52 | 53 | unsigned id; 54 | unsigned country; 55 | unsigned region; 56 | unsigned city; 57 | 58 | float lat; 59 | float lon; 60 | }; 61 | 62 | class LocationTable 63 | { 64 | public: 65 | void load(MemoryFile& file) 66 | { 67 | file.load_mapped_string_vector(country); 68 | file.load_mapped_string_vector(region); 69 | file.load_mapped_string_vector(city); 70 | file.load_mapped_vector(locations); 71 | } 72 | 73 | void dump() 74 | { 75 | 
printf("loc size %d\n", (int) locations.size()); 76 | 77 | for (size_t i = 0; i < locations.size(); ++i) 78 | { 79 | const PackedLocation &loc = locations[i]; 80 | 81 | printf("%d %s %s %s\n", loc.id, 82 | country[loc.country], 83 | region[loc.region], 84 | city[loc.city]); 85 | } 86 | } 87 | 88 | // default copy/assign ok 89 | 90 | MappedStringVector country; 91 | MappedStringVector region; 92 | MappedStringVector city; 93 | MappedVector locations; 94 | }; 95 | 96 | class LocationParser : public Connector 97 | { 98 | public: 99 | LocationParser() 100 | : 101 | str_(), 102 | scratch_(), 103 | toks_(), 104 | line_(0) 105 | { 106 | } 107 | 108 | void consume(const Buffer &b) 109 | { 110 | line_++; 111 | if (line_ < 3) return; 112 | 113 | csv_split((const char*) b.data(), b.size(), scratch_, toks_); 114 | 115 | if (toks_.size() != 9) 116 | { 117 | return; 118 | } 119 | 120 | Location loc; 121 | 122 | loc.id = to_u(toks_[0]); 123 | loc.country = toks_[1]; 124 | loc.region = toks_[2]; 125 | loc.city = toks_[3]; 126 | loc.lat = toks_[5]; 127 | loc.lon = toks_[6]; 128 | 129 | emit(Buffer(&loc, sizeof(loc))); 130 | } 131 | 132 | private: 133 | std::string str_; 134 | std::string scratch_; 135 | std::vector toks_; 136 | 137 | size_t line_; 138 | }; 139 | 140 | // convert string columns into string tables 141 | // turn location into packed location 142 | inline void save_locations(BinaryFile &file, 143 | const std::vector &locations) 144 | { 145 | unsigned maxid = 0; 146 | 147 | StringTable country; 148 | StringTable region; 149 | StringTable city; 150 | 151 | for (size_t i = 0; i < locations.size(); ++i) 152 | { 153 | const Location &loc = locations[i]; 154 | 155 | maxid = std::max(loc.id, maxid); 156 | 157 | country.insert(loc.country); 158 | region.insert(loc.region); 159 | city.insert(loc.city); 160 | } 161 | 162 | save_string_table(file, country); 163 | save_string_table(file, region); 164 | save_string_table(file, city); 165 | 166 | std::vector packed; 167 | 
packed.resize(maxid + 1); 168 | 169 | for (size_t i = 0; i < locations.size(); ++i) 170 | { 171 | const Location &loc = locations[i]; 172 | 173 | PackedLocation &ploc = packed[loc.id]; 174 | 175 | ploc.id = loc.id; 176 | ploc.country = country.index_of(loc.country); 177 | ploc.region = region.index_of(loc.region); 178 | ploc.city = city.index_of(loc.city); 179 | ploc.lat = strtof(loc.lat.c_str(), 0); 180 | ploc.lon = strtof(loc.lon.c_str(), 0); 181 | } 182 | 183 | file.save_pod_vector(packed); 184 | } 185 | 186 | #endif 187 | -------------------------------------------------------------------------------- /geoloc/geoloc.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-06 7 | * 8 | * geoloc is a command line application for retrieving MaxMind geolocation and 9 | * ASN info for a set of IP addresses. 10 | * 11 | * The default output format is one record per line, space separated, with no 12 | * headers. The columns output are as follows: 13 | * 14 | * ip, country, region, city, latitude, longitude, as_num, as_text 15 | */ 16 | 17 | #include 18 | #include 19 | 20 | #include "etl.hpp" 21 | #include "query.hpp" 22 | #include "error.hpp" 23 | #include "args.hpp" 24 | 25 | #include 26 | 27 | static void usage(const char* condition) 28 | { 29 | if (condition) 30 | { 31 | fprintf(stderr, "%s\n", condition); 32 | } 33 | 34 | fprintf(stderr, "usage:"); 35 | fprintf(stderr, "\tgeoloc -f file ... [--headers]\n"); 36 | fprintf(stderr, "\tgeoloc -q ip ...\n"); 37 | fprintf(stderr, "\n"); 38 | fprintf(stderr, "This software includes GeoLite data created by MaxMind\n"); 39 | fprintf(stderr, "available from http://www.maxmind.com\n"); 40 | 41 | exit(1); 42 | } 43 | 44 | static std::string default_file() 45 | { 46 | const char* home_env = getenv("HOME"); 47 | std::string home_dir = home_env ? 
home_env : ""; 48 | 49 | if (home_dir.empty()) 50 | { 51 | FATAL_ERROR("could not get home dir"); 52 | } 53 | 54 | return home_dir + "/var/db/geoloc/geodata.bin"; 55 | } 56 | 57 | int main(int argc, char** argv) 58 | { 59 | if (argc < 2) 60 | { 61 | usage(0); 62 | } 63 | 64 | Args args(argc, argv); 65 | 66 | std::set flags; 67 | 68 | flags.insert("-f"); 69 | flags.insert("--import"); 70 | flags.insert("--headers"); 71 | flags.insert("-q"); 72 | flags.insert("-o"); 73 | 74 | std::vector input_list; 75 | std::string import; 76 | std::string output; 77 | 78 | std::string data_file_name = default_file(); 79 | bool show_headers = false; 80 | 81 | while (!args.empty()) 82 | { 83 | if (strcmp(args.peek(), "-f") == 0) 84 | { 85 | args.pop(); 86 | 87 | while (true) 88 | { 89 | const char* fn = args.pop(); 90 | 91 | if (!fn) 92 | { 93 | usage("empty file arg"); 94 | } 95 | 96 | input_list.push_back(std::string("file:") + fn); 97 | 98 | if (args.empty() || flags.count(args.peek())) 99 | break; 100 | } 101 | } 102 | else if (strcmp(args.peek(), "--headers") == 0) 103 | { 104 | show_headers = true; 105 | args.pop(); 106 | } 107 | else if (strcmp(args.peek(), "-o") == 0) 108 | { 109 | args.pop(); 110 | 111 | const char* arg = args.pop(); 112 | 113 | if (!arg) 114 | { 115 | usage("empty output arg"); 116 | } 117 | 118 | output = arg; 119 | } 120 | else if (strcmp(args.peek(), "--import") == 0) 121 | { 122 | args.pop(); 123 | 124 | if (!import.empty()) 125 | { 126 | usage("too many import options"); 127 | } 128 | 129 | const char* arg = args.pop(); 130 | 131 | if (!arg) 132 | { 133 | usage("empty import arg"); 134 | } 135 | 136 | import = arg; 137 | } 138 | else if (strcmp(args.peek(), "-q") == 0) 139 | { 140 | args.pop(); 141 | 142 | std::string query = "query:"; 143 | 144 | while (true) 145 | { 146 | const char* arg = args.pop(); 147 | 148 | if (!arg) 149 | { 150 | usage("empty query arg"); 151 | } 152 | 153 | query += arg + std::string(","); 154 | 155 | if (args.empty() || 
flags.count(args.peek())) 156 | break; 157 | } 158 | 159 | input_list.push_back(query); 160 | } 161 | else 162 | { 163 | fprintf(stderr, "unrecognized option %s\n", args.peek()); 164 | usage(0); 165 | } 166 | } 167 | 168 | if (!import.empty()) 169 | { 170 | if (!input_list.empty()) 171 | { 172 | usage("import and query are mutually exclusive"); 173 | } 174 | 175 | if (output.empty()) 176 | { 177 | usage("no output specified with import"); 178 | } 179 | 180 | std::string city_blocks = import + "/blocks.csv"; 181 | std::string city_locs = import + "/location.csv"; 182 | std::string geo_asns = import + "/asnum.csv"; 183 | 184 | etl(city_blocks.c_str(), city_locs.c_str(), geo_asns.c_str(), 185 | output.c_str()); 186 | } 187 | else 188 | { 189 | if (input_list.empty()) 190 | { 191 | usage("query has no input"); 192 | } 193 | 194 | if (!import.empty()) 195 | { 196 | usage("import and query are mutually exclusive"); 197 | } 198 | 199 | query(data_file_name.c_str(), input_list, show_headers); 200 | } 201 | 202 | return 0; 203 | } 204 | -------------------------------------------------------------------------------- /geoloc/serialization.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This module contains classes for saving data into binary files, and loading 9 | * it back in from memory maps. 
10 | */ 11 | 12 | #ifndef SERIALIZATION_HPP_E69107D1 13 | #define SERIALIZATION_HPP_E69107D1 14 | 15 | #include "macros.hpp" 16 | #include "error.hpp" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | template 30 | struct RawMappedVector 31 | { 32 | unsigned size_; 33 | T data_[]; 34 | }; 35 | 36 | template 37 | class MappedVector 38 | { 39 | public: 40 | MappedVector() 41 | : 42 | ptr_(0) 43 | { 44 | } 45 | 46 | ~MappedVector() 47 | { 48 | } 49 | 50 | void init(const RawMappedVector* ptr) 51 | { 52 | ptr_ = ptr; 53 | } 54 | 55 | size_t size() const 56 | { 57 | return ptr_->size_; 58 | } 59 | 60 | const T& operator[](size_t n) const 61 | { 62 | return ptr_->data_[n]; 63 | } 64 | 65 | const T* begin() const 66 | { 67 | return ptr_->data_; 68 | } 69 | 70 | const T* end() const 71 | { 72 | return begin() + size(); 73 | } 74 | 75 | size_t byte_size() const 76 | { 77 | return sizeof(ptr_->size_) + ptr_->size_ * sizeof(T); 78 | } 79 | 80 | // default copy/assign ok 81 | 82 | private: 83 | const RawMappedVector* ptr_; 84 | }; 85 | 86 | class MemoryMap 87 | { 88 | public: 89 | MemoryMap() 90 | : 91 | data_(0), 92 | len_(0) 93 | { 94 | } 95 | 96 | ~MemoryMap() 97 | { 98 | if (data_) 99 | { 100 | int rc = munmap((void*) data_, len_); 101 | UNUSED(rc); 102 | } 103 | } 104 | 105 | bool open(const char* fn) 106 | { 107 | int fd = ::open(fn, O_RDONLY); 108 | 109 | if (fd < 0) 110 | { 111 | return false; 112 | } 113 | 114 | off_t end = lseek(fd, 0, SEEK_END); 115 | 116 | if (end == -1) 117 | { 118 | LOG_CONTEXT("could not seek to end of %s", fn); 119 | return false; 120 | } 121 | 122 | off_t beg = lseek(fd, 0, SEEK_SET); 123 | 124 | if (beg == -1) 125 | { 126 | LOG_CONTEXT("could not seek to beg of %s", fn); 127 | return false; 128 | } 129 | 130 | len_ = end; 131 | data_ = mmap(0, len_, PROT_READ, MAP_SHARED, fd, 0); 132 | 133 | if (!data_) 134 | { 135 | LOG_CONTEXT("could not 
mmap %s", fn); 136 | close(fd); 137 | return false; 138 | } 139 | 140 | int rc = ::close(fd); 141 | UNUSED(rc); 142 | 143 | return true; 144 | } 145 | 146 | const void* data() const 147 | { 148 | return data_; 149 | } 150 | 151 | const char* begin() const 152 | { 153 | return (const char*) data_; 154 | } 155 | 156 | size_t size() const 157 | { 158 | return len_; 159 | } 160 | 161 | private: 162 | DISALLOW_COPY_AND_ASSIGN(MemoryMap); 163 | 164 | const void *data_; 165 | size_t len_; 166 | }; 167 | 168 | class BinaryFile 169 | { 170 | public: 171 | BinaryFile() 172 | : 173 | file_(0) 174 | { 175 | } 176 | 177 | ~BinaryFile() 178 | { 179 | if (file_) 180 | { 181 | fclose(file_); 182 | } 183 | } 184 | 185 | bool open(const char* fn) 186 | { 187 | file_ = fopen(fn, "w"); 188 | return file_; 189 | } 190 | 191 | off_t offset() 192 | { 193 | return ftello(file_); 194 | } 195 | 196 | void save_type(const char* x) 197 | { 198 | assert(strlen(x) == 4); 199 | size_t rn = fwrite(x, 1, 4, file_); 200 | 201 | if (rn != 4) 202 | { 203 | FATAL_ERROR("failed to save_type 4 bytes"); 204 | } 205 | } 206 | 207 | void save_unsigned(unsigned x) 208 | { 209 | size_t rn = fwrite(&x, 1, 4, file_); 210 | 211 | if (rn != 4) 212 | { 213 | FATAL_ERROR("failed to save_unsigned 4 bytes"); 214 | } 215 | } 216 | 217 | void save_bytes_padded(const void* b, size_t n) 218 | { 219 | size_t rn = fwrite(b, 1, n, file_); 220 | 221 | if (rn != n) 222 | { 223 | FATAL_ERROR("failed to save_bytes_padded %zu bytes", n); 224 | } 225 | 226 | return pad(); 227 | } 228 | 229 | void save_bytes_raw(const void* b, size_t n) 230 | { 231 | size_t rn = fwrite(b, 1, n, file_); 232 | 233 | if (rn != n) 234 | { 235 | FATAL_ERROR("failed to save_bytes_raw %zu bytes", n); 236 | } 237 | } 238 | 239 | void seek(off_t where) 240 | { 241 | int rc = fseeko(file_, where, SEEK_SET); 242 | 243 | if (rc != 0) 244 | { 245 | FATAL_ERROR("failed to seek to %zu rc %d", where, rc); 246 | } 247 | } 248 | 249 | void pad() 250 | { 251 | // 
// Returns true when ptr lies on a 4 byte boundary.
// The pointer is only inspected, never dereferenced, so pointers to const
// data are accepted as well.
inline bool isaligned(const void* ptr)
{
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    return (address & 0x3U) == 0;
}
356 | void* out = iter(); 357 | offset_ += n; 358 | 359 | return out; 360 | } 361 | 362 | const char* load_type() 363 | { 364 | return (const char*) get_mem(4); 365 | } 366 | 367 | const unsigned* load_unsigned() 368 | { 369 | REL_ASSERT(isaligned(iter())); 370 | return (const unsigned*)(get_mem(4)); 371 | } 372 | 373 | template 374 | const RawMappedVector* load_raw_mapped_vector() 375 | { 376 | const char* type = load_type(); 377 | 378 | if (type == 0) 379 | { 380 | return 0; 381 | } 382 | 383 | if (memcmp(type, "PODV", 4) != 0) 384 | { 385 | return 0; 386 | } 387 | 388 | const unsigned* len = load_unsigned(); 389 | 390 | if (len == 0) 391 | { 392 | return 0; 393 | } 394 | 395 | return (const RawMappedVector*)(get_mem(*len)); 396 | } 397 | 398 | template 399 | void load_mapped_vector(MappedVector &out) 400 | { 401 | const RawMappedVector* foo = load_raw_mapped_vector(); 402 | 403 | if (!foo) 404 | { 405 | FATAL_ERROR("could not load_mapped_vector"); 406 | } 407 | 408 | out.init(foo); 409 | } 410 | 411 | void load_mapped_string_vector(MappedStringVector &out) 412 | { 413 | const RawMappedVector* foo = 414 | load_raw_mapped_vector(); 415 | 416 | if (!foo) 417 | { 418 | FATAL_ERROR("could not load_mapped_string_vector indices"); 419 | } 420 | 421 | const RawMappedVector* str = load_raw_mapped_vector(); 422 | 423 | if (!str) 424 | { 425 | FATAL_ERROR("could not load_mapped_string_vector strings"); 426 | } 427 | 428 | out.init(str->data_, foo); 429 | } 430 | 431 | private: 432 | 433 | DISALLOW_COPY_AND_ASSIGN(MemoryFile); 434 | 435 | MemoryMap file_; 436 | off_t offset_; 437 | }; 438 | 439 | #endif 440 | -------------------------------------------------------------------------------- /geoloc/query.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Jason McSweeney 3 | * Licensed under BSD 3 Clause - see LICENSE 4 | * 5 | * author: Jason McSweeney 6 | * created: 2015-03-11 7 | * 8 | * This module handles the 
query phase of geoloc. It is reponsible for loading 9 | * the geodata file, and providing interfaces for querying it. 10 | * 11 | * The main part of the query code uses a binary search (std::upper_bound) 12 | * against a set of memory mapped sorted vectors. 13 | */ 14 | 15 | #ifndef QUERY_HPP_EC6CE5A7 16 | #define QUERY_HPP_EC6CE5A7 17 | 18 | #include "macros.hpp" 19 | #include "error.hpp" 20 | #include "blocks.hpp" 21 | #include "locations.hpp" 22 | #include "asns.hpp" 23 | #include "csv.hpp" 24 | #include "pipeline.hpp" 25 | 26 | #include 27 | 28 | struct IPResult 29 | { 30 | IPResult() 31 | : 32 | quad(0), 33 | country(0), 34 | region(0), 35 | city(0), 36 | lat(0), 37 | lon(0), 38 | asn(0), 39 | asn_text(0) 40 | { 41 | } 42 | 43 | unsigned quad; 44 | 45 | const char* country; 46 | const char* region; 47 | const char* city; 48 | 49 | float lat; 50 | float lon; 51 | 52 | const unsigned* asn; 53 | const char* asn_text; 54 | }; 55 | 56 | class GeoData 57 | { 58 | public: 59 | GeoData() {} 60 | ~GeoData() {} 61 | 62 | void open(const char* fn) 63 | { 64 | LOG_CONTEXT("GeoData open %s", fn); 65 | 66 | bool ok = mem_file_.open(fn); 67 | 68 | if (!ok) 69 | { 70 | FATAL_ERROR("could not open %s for reading", fn); 71 | } 72 | 73 | LOG_CONTEXT("GeoData read header"); 74 | read_header(); 75 | 76 | LOG_CONTEXT("GeoData load location_ip_blocks"); 77 | location_ip_blocks_.load(mem_file_); 78 | 79 | LOG_CONTEXT("GeoData load location_data"); 80 | location_data_.load(mem_file_); 81 | 82 | LOG_CONTEXT("GeoData load asn_ip_blocks"); 83 | asn_ip_blocks_.load(mem_file_); 84 | 85 | LOG_CONTEXT("GeoData load asn_data"); 86 | asn_data_.load(mem_file_); 87 | } 88 | 89 | void check_header_value(const char* type, 90 | const char* value, 91 | const char* expected) 92 | { 93 | if (strcmp(value, expected) != 0) 94 | { 95 | FATAL_ERROR("header %s expecting %s got %s", 96 | type, value, expected); 97 | } 98 | } 99 | 100 | void read_header() 101 | { 102 | const char* raw = (const char*) 
mem_file_.get_mem(32); 103 | 104 | std::string header(raw, raw + 32); 105 | std::string scratch; 106 | std::vector toks; 107 | 108 | char_split(header, scratch, toks, ' '); 109 | 110 | if (toks.size() != 5) 111 | { 112 | FATAL_ERROR("header is corrupt - %s", header.c_str()); 113 | } 114 | 115 | check_header_value("header1", toks[0], "geoloc"); 116 | check_header_value("header2", toks[1], "loadzero"); 117 | check_header_value("version", toks[2], "v001"); 118 | check_header_value("endian", toks[3], get_endian()); 119 | } 120 | 121 | unsigned block_query(const BlockTable &blocks, unsigned quad) const 122 | { 123 | // find first pos compares gt quad 124 | 125 | const unsigned* iter = 126 | std::upper_bound(blocks.start_ip.begin(), 127 | blocks.start_ip.end(), 128 | quad); 129 | 130 | if (iter == blocks.start_ip.begin()) 131 | { 132 | return -1; 133 | } 134 | 135 | unsigned idx = iter - blocks.start_ip.begin(); 136 | unsigned ri = idx - 1; 137 | 138 | if (quad >= blocks.start_ip[ri] && 139 | quad <= blocks.end_ip[ri]) 140 | { 141 | return ri; 142 | } 143 | 144 | return -1; 145 | } 146 | 147 | unsigned location_block_query(unsigned quad) const 148 | { 149 | return block_query(location_ip_blocks_, quad); 150 | } 151 | 152 | void query(unsigned quad, IPResult &result) const 153 | { 154 | result.quad = quad; 155 | 156 | unsigned block_idx = block_query(location_ip_blocks_, quad); 157 | 158 | if (block_idx != -1U) 159 | { 160 | unsigned loc_idx = location_ip_blocks_.loc[block_idx]; 161 | const PackedLocation& loc = location_data_.locations[loc_idx]; 162 | 163 | result.country = location_data_.country[loc.country]; 164 | result.region = location_data_.region[loc.region]; 165 | result.city = location_data_.city[loc.city]; 166 | result.lat = loc.lat; 167 | result.lon = loc.lon; 168 | } 169 | 170 | block_idx = block_query(asn_ip_blocks_, quad); 171 | 172 | if (block_idx != -1U) 173 | { 174 | unsigned asn_idx = asn_ip_blocks_.loc[block_idx]; 175 | const PackedASN& asn = 
// Formats quad as a dotted quad ("a.b.c.d", most significant byte first)
// into out and returns the number of characters written, not counting the
// terminating NUL. out must have room for at least 16 bytes.
inline int ip_to_s(char* out, unsigned quad)
{
    int octet[4];

    // peel the bytes off from the most significant end
    for (int i = 0; i < 4; ++i)
    {
        const int shift = 24 - 8 * i;
        octet[i] = (quad >> shift) & 0xff;
    }

    return sprintf(out, "%d.%d.%d.%d",
                   octet[0], octet[1], octet[2], octet[3]);
}
// Makes str safe to print as one space separated column and stores the
// result in out.
//
// A null or empty str becomes the placeholder "%". Otherwise the text is
// copied with every space replaced by '+'.
// TODO - escape into percent encoded ASCII.
inline void escape(std::string &out, const char* str)
{
    const bool missing = (str == 0) || (*str == '\0');

    if (missing)
    {
        out = "%";
        return;
    }

    out.assign(str, strlen(str));

    for (std::string::size_type i = 0; i < out.size(); ++i)
    {
        if (out[i] == ' ')
        {
            out[i] = '+';
        }
    }
}
print(result->lon); delimit(); 364 | print_as(result->asn); delimit(); 365 | print(result->asn_text); 366 | newline(); 367 | 368 | size_t n = fwrite(&print_buf_[0], print_buf_.size(), 1, stdout); 369 | REL_ASSERT(n == 1); 370 | 371 | print_buf_.clear(); 372 | } 373 | 374 | private: 375 | 376 | std::string print_buf_; 377 | std::string esc_buf_; 378 | }; 379 | 380 | template 381 | inline void query(T &reader, GeoData &data) 382 | { 383 | IPParser parser; 384 | IPScanner scanner(data); 385 | 386 | IPResultEmitter emitter; 387 | 388 | reader | parser | scanner | emitter; 389 | reader.produce(); 390 | } 391 | 392 | inline void query(GeoData &data, const std::string &source) 393 | { 394 | LOG_CONTEXT("query data with source %s", source.c_str()); 395 | 396 | const char* iter = &source[0]; 397 | const char* end = &source[0] + source.size(); 398 | const char* res = strchr(iter, ':'); 399 | 400 | if (!res) 401 | { 402 | FATAL_ERROR("could not parse source %s", source.c_str()); 403 | } 404 | 405 | std::string protocol(iter, res); 406 | std::string path(res+1, end); 407 | 408 | if (protocol == "file") 409 | { 410 | FileReader reader(path); 411 | query(reader, data); 412 | } 413 | else if (protocol == "query") 414 | { 415 | std::vector toks; 416 | std::string scratch; 417 | csv_split(&path[0], path.size(), scratch, toks); 418 | 419 | std::vector ip_list; 420 | ip_list.assign(toks.begin(), toks.end()); 421 | 422 | StringInjector reader(ip_list); 423 | query(reader, data); 424 | } 425 | else 426 | { 427 | FATAL_ERROR("unknown source protocol %s", protocol.c_str()); 428 | } 429 | } 430 | 431 | inline void query(const char* data_file_name, 432 | const std::vector &data_sources, 433 | bool show_headers) 434 | { 435 | LOG_CONTEXT("query data %s with %zu sources", data_file_name, data_sources.size()); 436 | 437 | GeoData data; 438 | data.open(data_file_name); 439 | 440 | if (show_headers) 441 | { 442 | IPResultEmitter::show_headers(); 443 | } 444 | 445 | for (size_t i = 0; i < 
data_sources.size(); ++i) 446 | { 447 | query(data, data_sources[i]); 448 | } 449 | } 450 | 451 | #endif 452 | --------------------------------------------------------------------------------