├── whoisxmlapi_download_whois_data ├── whois_utils │ ├── __init__.py │ ├── whois_user_interaction.py │ └── whois_web_download_utils.py ├── requirements.txt ├── requirements_legacy.txt ├── requirements_windows.txt ├── HOWTO_add_a_new_feed.txt ├── new_generation_plans.dat ├── install_p12.py ├── README_python2.txt ├── CHANGELOG.txt ├── README.SSL.txt ├── FAQ.txt └── SPECIFICATIONS.txt ├── whoisxmlapi_whoisdownload_bash ├── supported_gtlds ├── BUGS ├── install_p12pack.sh ├── README.SSL ├── supported_ngtlds └── README ├── UsingScriptsOnWindows10.pdf ├── legacy_scripts ├── get_whois_info-0.0.4.zip ├── get_whois_info-0.0.5.zip ├── get_whois_info-0.0.6.zip ├── get_whois_info_python-0.0.6.zip └── README ├── whoisxmlapi_bash_csv_to_mysqldb ├── countries ├── CHANGELOG.txt ├── load_csv_file_into_db_old.sh ├── loader_schema_simple.sql ├── loader_schema_regular_daily_only.sql ├── loader_schema_regular.sql ├── loader_schema_full.sql ├── load_csv_file_into_db.sh └── README ├── netblocks_csv_to_mysqldb ├── NetblocksRDB_Diagram.png ├── load_netblocks_data_to_mysql.odt ├── load_netblocks_data_to_mysql.pdf └── README.md ├── website_contactscats_to_mysqldb ├── website_categories_schema.png ├── website_contats_categories_schema.png ├── load_contactscategories_jsonl_to_mysql.odt ├── load_contactscategories_jsonl_to_mysql.pdf ├── website_categories.ddl ├── README.md ├── website_contacts_categories.ddl ├── load_contactscategories_jsonl_to_mysql.py └── load_contactscategories_jsonl_to_mysql.txt ├── whoisxmlapi_mysqldump_loaders ├── legacy │ ├── load_mysql_data_all_for_all_tlds.sh │ ├── load_mysql_data_per_tables_for_all_tlds.sh │ ├── load_mysql_schema.sh │ ├── load_mysql_data_all.sh │ ├── load_mysql_data_per_tables_for_tld.sh │ ├── load_mysql_data_all_for_tld.sh │ └── load_mysql_data_per_tables.sh ├── load_mysql_utils.sh ├── README ├── load_mysql_data_all.sh └── load_mysql_data_per_tables.sh ├── whoisxmlapi_percona_loader_scripts ├── load_mysql_utils.sh ├── legacy │ └── restore_db.sh ├── README.txt ├── whoiscrawler_mysql_schema.sql └── load_whois_percona.sh ├── .gitignore ├── README.md ├── whoisxmlapi_csv2json ├── transform_json.py ├── transform_json_verbose.py └── README └── whoisxmlapi_flexible_csv_to_mysqldb ├── field_types.csv └── README /whoisxmlapi_download_whois_data/whois_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /whoisxmlapi_whoisdownload_bash/supported_gtlds: -------------------------------------------------------------------------------- 1 | com,net,org,info,us,biz,mobi,pro,asia,aero,tel,name 2 | -------------------------------------------------------------------------------- /UsingScriptsOnWindows10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/UsingScriptsOnWindows10.pdf -------------------------------------------------------------------------------- /legacy_scripts/get_whois_info-0.0.4.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/legacy_scripts/get_whois_info-0.0.4.zip -------------------------------------------------------------------------------- /legacy_scripts/get_whois_info-0.0.5.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/legacy_scripts/get_whois_info-0.0.5.zip -------------------------------------------------------------------------------- /legacy_scripts/get_whois_info-0.0.6.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/legacy_scripts/get_whois_info-0.0.6.zip -------------------------------------------------------------------------------- /whoisxmlapi_bash_csv_to_mysqldb/countries: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/whoisxmlapi_bash_csv_to_mysqldb/countries -------------------------------------------------------------------------------- /legacy_scripts/get_whois_info_python-0.0.6.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/legacy_scripts/get_whois_info_python-0.0.6.zip -------------------------------------------------------------------------------- /netblocks_csv_to_mysqldb/NetblocksRDB_Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/netblocks_csv_to_mysqldb/NetblocksRDB_Diagram.png -------------------------------------------------------------------------------- /legacy_scripts/README: -------------------------------------------------------------------------------- 1 | These are previous versions of downloader scripts which are not 2 | developed anymore. They are kept here for clients who still use them 3 | for compatibility purposes. 
4 | -------------------------------------------------------------------------------- /netblocks_csv_to_mysqldb/load_netblocks_data_to_mysql.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/netblocks_csv_to_mysqldb/load_netblocks_data_to_mysql.odt -------------------------------------------------------------------------------- /netblocks_csv_to_mysqldb/load_netblocks_data_to_mysql.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/netblocks_csv_to_mysqldb/load_netblocks_data_to_mysql.pdf -------------------------------------------------------------------------------- /website_contactscats_to_mysqldb/website_categories_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/website_contactscats_to_mysqldb/website_categories_schema.png -------------------------------------------------------------------------------- /website_contactscats_to_mysqldb/website_contats_categories_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/website_contactscats_to_mysqldb/website_contats_categories_schema.png -------------------------------------------------------------------------------- /website_contactscats_to_mysqldb/load_contactscategories_jsonl_to_mysql.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/website_contactscats_to_mysqldb/load_contactscategories_jsonl_to_mysql.odt -------------------------------------------------------------------------------- /website_contactscats_to_mysqldb/load_contactscategories_jsonl_to_mysql.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whois-api-llc/whois_database_download_support/HEAD/website_contactscats_to_mysqldb/load_contactscategories_jsonl_to_mysql.pdf -------------------------------------------------------------------------------- /whoisxmlapi_whoisdownload_bash/BUGS: -------------------------------------------------------------------------------- 1 | -When downloading from whois_database 2 | there is no way to download sql schema + separate tables. 3 | This should be a format like sqlschematables, which downloads 4 | whoiscrawler_$version_$tld_mysql_schema.sql.gz 5 | and the tables subdirectory. 6 | 7 | -Sample data cannot be downloaded. 
8 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi>=2019.11.28
2 | cffi>=1.13.2
3 | chardet>=3.0.4
4 | configobj>=5.0.6
5 | configparser>=4.0.2
6 | cryptography>=3.4
7 | easygui>=0.98.1
8 | idna>=2.8
9 | pycparser>=2.19
10 | pycryptodome>=3.7.0
11 | pyOpenSSL>=19.1.0
12 | recordtype>=1.3
13 | requests>=2.22.0
14 | six>=1.13.0
15 | urllib3>=1.26.7
16 | urlparse2>=1.1.1
17 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/requirements_legacy.txt:
--------------------------------------------------------------------------------
1 | certifi>=2019.11.28
2 | cffi>=1.13.2
3 | chardet>=3.0.4
4 | configobj>=5.0.6
5 | configparser>=4.0.2
6 | cryptography>=2.8
7 | easygui>=0.98.1
8 | idna>=2.8
9 | pycparser>=2.19
10 | pycrypto>=2.6.1
11 | pyOpenSSL>=19.1.0
12 | recordtype>=1.3
13 | requests>=2.22.0
14 | six>=1.13.0
15 | urllib3>=1.25.7
16 | urlparse2>=1.1.1
17 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/requirements_windows.txt:
--------------------------------------------------------------------------------
1 | certifi>=2019.11.28
2 | cffi>=1.13.2
3 | chardet>=3.0.4
4 | configobj>=5.0.6
5 | configparser>=4.0.2
6 | cryptography>=2.8
7 | easygui>=0.98.1
8 | idna>=2.8
9 | pycparser>=2.19
10 | pycryptodome>=3.7.0
11 | pyOpenSSL>=19.1.0
12 | recordtype>=1.3
13 | requests>=2.22.0
14 | six>=1.13.0
15 | urllib3>=1.25.7
16 | urlparse2>=1.1.1
17 |
--------------------------------------------------------------------------------
/whoisxmlapi_bash_csv_to_mysqldb/CHANGELOG.txt:
--------------------------------------------------------------------------------
1 | Changelog for WhoisXML API
2 |
3 | CSV importing scripts and schema
4 |
5 | -0.0.2: 2018-01-10:
6 |    o initial release on GitHub
7 |
8 | -0.0.3: 2018-11-01:
9 |    o a critical bug fixed: the script now handles both
10 |      UNIX-style (LF) and Windows-style (CRLF) terminated
11 |      input csv files.
12 |    o changelog added
13 |    o unnecessary files removed
14 |
15 |
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/legacy/load_mysql_data_all_for_all_tlds.sh:
--------------------------------------------------------------------------------
1 | src_root_dir="$1"
2 | version="$2"
3 | db_username="$3"
4 | db_password="$4"
5 |
6 | if [ ! -d "$src_root_dir" ]; then
7 |     echo "src_root_dir $src_root_dir is not valid"
8 |     exit
9 | fi
10 | if [ -z "$version" ]; then
11 |     echo "version is missing"
12 |     exit
13 | fi
14 |
15 | tlds="asia us biz mobi info org net com"
16 | for tld in $tlds; do
17 |     ./load_mysql_data_all_for_tld.sh $src_root_dir $tld $version $db_username $db_password
18 | done
19 |
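A minimal invocation sketch for the wrapper above; the source tree, quarterly version tag, and credentials are hypothetical:

    # Expects per-TLD dumps such as /data/whois_v20/com/whoiscrawler_v20_com_mysql.sql.gz;
    # loads each TLD in the hard-coded list via load_mysql_data_all_for_tld.sh.
    ./load_mysql_data_all_for_all_tlds.sh /data/whois_v20 v20 dbuser dbpass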
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/legacy/load_mysql_data_per_tables_for_all_tlds.sh:
--------------------------------------------------------------------------------
1 | src_root_dir="$1"
2 | version="$2"
3 | db_username="$3"
4 | db_password="$4"
5 |
6 | if [ ! -d "$src_root_dir" ]; then
7 |     echo "src_root_dir $src_root_dir is not valid"
8 |     exit
9 | fi
10 | if [ -z "$version" ]; then
11 |     echo "version is missing"
12 |     exit
13 | fi
14 |
15 | tlds="pro coop asia us biz mobi info org net com"
16 | for tld in $tlds; do
17 |     ./load_mysql_data_per_tables_for_tld.sh $src_root_dir $tld $version $db_username $db_password
18 | done
19 |
--------------------------------------------------------------------------------
/website_contactscats_to_mysqldb/website_categories.ddl:
--------------------------------------------------------------------------------
1 | /*
2 | Sample schema file for the Website Contacts and Categories MySQL database
3 | Categories-only version
4 | v 0.0
5 | (c) WhoisXML API, Inc.
6 | */
7 |
8 | CREATE TABLE category(
9 |     category VARCHAR(255) PRIMARY KEY
10 | );
11 |
12 | CREATE TABLE domain(
13 |     domainID INTEGER PRIMARY KEY AUTO_INCREMENT,
14 |     domainName VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
15 |     countryCode VARCHAR(2)
16 | );
17 |
18 | CREATE TABLE domain_category(
19 |     categoryID VARCHAR(255),
20 |     domainID INTEGER,
21 |     PRIMARY KEY (categoryID, domainID)
22 | );
23 |
24 |
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/legacy/load_mysql_schema.sh:
--------------------------------------------------------------------------------
1 | schema_file="$1"
2 | if [ ! -f "$schema_file" ]; then
3 |     echo "invalid schema file $schema_file";
4 |     exit
5 | fi
6 | db="$2"
7 | if [ -z "$db" ]; then
8 |     echo "db is missing"
9 |     exit
10 | fi
11 | db_username="$3"
12 | if [ -z "$db_username" ]; then
13 |     echo "db username is missing"
14 |     exit
15 | fi
16 | db_password="$4"
17 | if [ -z "$db_password" ]; then
18 |     echo "db_password is missing"
19 |     exit
20 | fi
21 | mysql -u$db_username -p$db_password -e "create database $db"
22 | if [ ${schema_file: -3} == ".gz" ]; then
23 |
24 |     gunzip<$schema_file | mysql -u$db_username -p$db_password $db
25 | else
26 |     mysql -u$db_username -p$db_password $db <$schema_file
27 | fi
28 |
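A minimal invocation sketch for the schema loader above (database name and credentials are hypothetical; the script creates the database and accepts both plain and gzipped schema files):

    ./load_mysql_schema.sh whoiscrawler_v20_com_mysql_schema.sql.gz whoiscrawler_v20_com dbuser dbpass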
--------------------------------------------------------------------------------
/netblocks_csv_to_mysqldb/README.md:
--------------------------------------------------------------------------------
1 | A Python script to create and maintain a MySQL netblocks database
2 |
3 | using csv format netblocks data downloaded
4 |
5 | from a Whois XML netblocks database subscription
6 |
7 | (c) Whois XML API, Inc. 2019.
8 |
9 | ver. 0.0.3
10 |
11 | Contents
12 | --------
13 |
14 | load_netblocks_data_to_mysql.py - the script
15 |
16 | Documentation:
17 |
18 | README.md - this file
19 |
20 | load_netblocks_data_to_mysql.txt - Documentation - UTF-8 text format
21 | load_netblocks_data_to_mysql.pdf - Documentation - pdf format
22 | load_netblocks_data_to_mysql.odt - Documentation - OpenDocument text format
23 | load_netblocks_data_to_mysql.org - Documentation - emacs .org format
24 |
25 | Database diagram (to supplement the text-format Documentation):
26 |
27 | NetblocksRDB_Diagram.svg - svg format
28 | NetblocksRDB_Diagram.png - png format
29 |
30 |
31 | Consult the Documentation for further details
32 |
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/load_mysql_utils.sh:
--------------------------------------------------------------------------------
1 | #Utilities to be included into mysql loader scripts
2 |
3 | #
4 | # Prints the version number and exits.
5 | #
6 | function printVersionAndExit()
7 | {
8 |     echo "$MYNAME Version $VERSION"
9 |     echo ""
10 |     exit 0
11 | }
12 | #
13 | #
14 | # Prints all the arguments but only if the program is in the verbose mode.
15 | #
16 | function printVerbose()
17 | {
18 |     if [ "$VERBOSE" == "true" ]; then
19 |         echo $* >&2
20 |     fi
21 | }
22 |
23 | #
24 | # Prints an error message to the standard error. The text will not be mixed up with
25 | # the data that is printed to the standard output.
26 | #
27 | function printError()
28 | {
29 |     echo "$*" >&2
30 | }
31 |
32 | function printMessage()
33 | {
34 |     echo -n "$*" >&2
35 | }
36 |
37 | function printMessageNl()
38 | {
39 |     echo "$*" >&2
40 | }
41 |
42 | function printDebug()
43 | {
44 |     if [ "$DEBUG" == "yes" ]; then
45 |         echo "$*" >&2
46 |     fi
47 | }
48 |
49 |
--------------------------------------------------------------------------------
/whoisxmlapi_percona_loader_scripts/load_mysql_utils.sh:
--------------------------------------------------------------------------------
1 | #Utilities to be included into mysql loader scripts
2 |
3 | #
4 | # Prints the version number and exits.
5 | #
6 | function printVersionAndExit()
7 | {
8 |     echo "$MYNAME Version $VERSION"
9 |     echo ""
10 |     exit 0
11 | }
12 | #
13 | #
14 | # Prints all the arguments but only if the program is in the verbose mode.
15 | #
16 | function printVerbose()
17 | {
18 |     if [ "$VERBOSE" == "true" ]; then
19 |         echo $* >&2
20 |     fi
21 | }
22 |
23 | #
24 | # Prints an error message to the standard error. The text will not be mixed up with
25 | # the data that is printed to the standard output.
26 | #
27 | function printError()
28 | {
29 |     echo "$*" >&2
30 | }
31 |
32 | function printMessage()
33 | {
34 |     echo -n "$*" >&2
35 | }
36 |
37 | function printMessageNl()
38 | {
39 |     echo "$*" >&2
40 | }
41 |
42 | function printDebug()
43 | {
44 |     if [ "$DEBUG" == "yes" ]; then
45 |         echo "$*" >&2
46 |     fi
47 | }
48 |
49 |
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/legacy/load_mysql_data_all.sh:
--------------------------------------------------------------------------------
1 | schema_file="$1"
2 | if [ ! -f "$schema_file" ]; then
3 |     echo "invalid schema file $schema_file";
4 |     exit
5 | fi
6 | dump_file="$2"
7 | if [ ! -f "$dump_file" ]; then
8 |     echo "please specify a valid mysqldump file"
9 |     exit
10 | fi
11 | db="$3"
12 | if [ -z "$db" ]; then
13 |     echo "db is missing"
14 |     exit
15 | fi
16 | db_username="$4"
17 | if [ -z "$db_username" ]; then
18 |     echo "db username is missing"
19 |     exit
20 | fi
21 | db_password="$5"
22 | if [ -z "$db_password" ]; then
23 |     echo "db_password is missing"
24 |     exit
25 | fi
26 | ./load_mysql_schema.sh $schema_file $db $db_username $db_password
27 | time=`date +%s`
28 | echo "loading data from file $dump_file"
29 | if [ ${dump_file: -3} == ".gz" ]; then
30 |
31 |     zcat "$dump_file" |mysql -u$db_username -p$db_password $db
32 | else
33 |
34 |     mysql -u$db_username -p$db_password $db <$dump_file
35 | fi
36 |
37 | time2=`date +%s`
38 | dur=`expr $time2 - $time`
39 | echo "took $dur seconds"
40 |
41 |
--------------------------------------------------------------------------------
/website_contactscats_to_mysqldb/README.md:
--------------------------------------------------------------------------------
1 | A script to load Website Contacts & Categories data
2 |
3 | in jsonl format, downloaded from the
4 |
5 | Website Contacts & Categorization database by WhoisXML API, Inc.
6 |
7 | (https://website-contacts-database.whoisxmlapi.com)
8 |
9 | into a MySQL database.
10 |
11 | (c) Whois XML API, Inc. 2019.
12 |
13 | ver. 0.0.1
14 |
15 | (first beta release)
16 |
17 | CONTENTS:
18 |
19 | load_contactscategories_jsonl_to_mysql.py - the script
20 |
21 | recommended DB schema SQL-s:
22 |
23 | website_categories.ddl - categories only
24 | website_contacts_categories.ddl - categories and contacts
25 |
26 | Documentation:
27 |
28 | README.md - this file
29 |
30 | load_contactscategories_jsonl_to_mysql.txt - Documentation - UTF-8 text format
31 | load_contactscategories_jsonl_to_mysql.pdf - Documentation - pdf format
32 | load_contactscategories_jsonl_to_mysql.odt - Documentation - OpenDocument text format
33 |
34 |
35 | Database diagrams (to supplement the text-format Documentation):
36 |
37 | website_categories_schema.png
38 | website_contats_categories_schema.png
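A minimal setup sketch for the loader above; the database name and credentials are hypothetical, and the loader script itself is then pointed at this database as described in the documentation:

    # Create the target database and load the recommended contacts+categories schema.
    mysql -udbuser -pdbpass -e "CREATE DATABASE contactsdb"
    mysql -udbuser -pdbpass contactsdb < website_contacts_categories.ddl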
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/legacy/load_mysql_data_per_tables_for_tld.sh:
--------------------------------------------------------------------------------
1 | src_root_dir="$1"
2 | tld="$2"
3 | version="$3"
4 | db_username="$4"
5 | db_password="$5"
6 |
7 | if [ ! -d "$src_root_dir" ]; then
8 |     echo "src_root_dir $src_root_dir is not valid"
9 |     exit
10 | fi
11 | if [ -z "$tld" ]; then
12 |     echo "tld is missing"
13 |     exit
14 | fi
15 | if [ -z "$version" ]; then
16 |     echo "version is missing"
17 |     exit
18 | fi
19 |
20 | schema_file="$src_root_dir/$tld/whoiscrawler_"$version"_$tld"_mysql_schema.sql.gz
21 | schema_file2="$src_root_dir/$tld/whoiscrawler_$tld"_mysql_schema.sql.gz
22 |
23 | if [ ! -f "$schema_file" ] && [ ! -f "$schema_file2" ]; then
24 |     echo "invalid schema file $schema_file";
25 |     exit
26 | fi
27 | tables_dir=$src_root_dir/$tld/tables
28 |
29 | if [ ! -d "$tables_dir" ]; then
30 |     echo "no valid tables dir $tables_dir"
31 |     exit
32 | fi
33 | db=whoiscrawler_$version"_$tld"
34 | if [ ! -f "$schema_file" ]; then
35 |     schema_file=$schema_file2
36 | fi
37 | if [ ! -f "$dump_file" ]; then
38 |     dump_file="$dump_file2"
39 | fi
40 | ./load_mysql_data_per_tables.sh $schema_file $tables_dir $db $db_username $db_password
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/HOWTO_add_a_new_feed.txt:
--------------------------------------------------------------------------------
1 | For developers/testers:
2 |
3 | how to add a new feed
4 |
5 | 1. Open feeds.ini with your favourite programming text editor
6 |
7 | 2. Add a new section (to a logical place) with the section name
8 |
9 |    feed__dataformat
10 |
11 | Note: as ini files do not support hierarchy, you will need a record
12 | for each dataformat. Take care that the common data should be similar.
13 |
14 | See the examples already there.
15 |
16 | 3. Fill in the required fields according to the already added feeds. In
17 | file names you can use the following strings, which will be substituted:
18 |    $dbversion: quarterly database version, e.g. 'v6' or 'v20'
19 |    $date: date string for daily feeds, e.g. '2017_08_20'
20 |    $minusdate: date string for daily feeds in the format e.g. '2017-08-20'
21 |    $_date: date string preceded by an underscore, e.g. '_2017_08_20'
22 |    $tld: tld, e.g. 'com' or 'ac.at'
23 |    $tldunderscore: tld string, dots replaced by underscores, e.g. 'ac_at' for
24 |      'ac.at'
25 |    $filename: a file name. Used in file masks which are derived from files such as md5mask
26 |    $ALLFILES: all files of the directory. Typically for subdirectories with csvs.
27 | If you need more, let us know.
28 |
29 | 4. Give it a try.
30 |
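To make step 3 above concrete, a purely illustrative sketch of such a section follows; the section name pattern comes from step 2, but the key names and values here are made up and must be copied from a real section already present in feeds.ini:

    [example_feed__regular_csv]
    # Hypothetical keys -- use an existing section of feeds.ini as the template.
    download_mask = example_$tld$_date.csv.gz
    md5_mask = $filename.md5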
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/legacy/load_mysql_data_all_for_tld.sh:
--------------------------------------------------------------------------------
1 | src_root_dir="$1"
2 | tld="$2"
3 | version="$3"
4 | db_username="$4"
5 | db_password="$5"
6 |
7 | if [ ! -d "$src_root_dir" ]; then
8 |     echo "src_root_dir $src_root_dir is not valid"
9 |     exit
10 | fi
11 | if [ -z "$tld" ]; then
12 |     echo "tld is missing"
13 |     exit
14 | fi
15 | if [ -z "$version" ]; then
16 |     echo "version is missing"
17 |     exit
18 | fi
19 |
20 | schema_file="$src_root_dir/$tld/whoiscrawler_"$version"_$tld"_mysql_schema.sql.gz
21 | schema_file2="$src_root_dir/$tld/whoiscrawler_$tld"_mysql_schema.sql.gz
22 |
23 | if [ ! -f "$schema_file" ] && [ ! -f "$schema_file2" ]; then
24 |     echo "invalid schema file $schema_file";
25 |     exit
26 | fi
27 | dump_file=$src_root_dir/$tld/whoiscrawler_"$tld"_mysql.sql.gz
28 | dump_file2=$src_root_dir/$tld/whoiscrawler_$version"_$tld"_mysql.sql.gz
29 |
30 | if [ ! -f "$dump_file" ] && [ ! -f "$dump_file2" ]; then
31 |     echo "no valid mysqldump file $dump_file or $dump_file2 found"
32 |     exit
33 | fi
34 | db=whoiscrawler_$version"_$tld"
35 | if [ ! -f "$schema_file" ]; then
36 |     schema_file=$schema_file2
37 | fi
38 | if [ ! -f "$dump_file" ]; then
39 |     dump_file="$dump_file2"
40 | fi
41 | ./load_mysql_data_all.sh $schema_file $dump_file $db $db_username $db_password
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/new_generation_plans.dat:
--------------------------------------------------------------------------------
1 | pro:domain_names_dropped,domain_names_new,ngtlds_domain_names_dropped,ngtlds_domain_names_new
2 | enterprise:domain_names_dropped,domain_names_dropped_whois,domain_names_new,domain_names_whois,ngtlds_domain_names_dropped,ngtlds_domain_names_dropped_whois,ngtlds_domain_names_new,ngtlds_domain_names_whois
3 | custom1:domain_names_dropped,domain_names_dropped_whois,domain_names_new,domain_names_whois,domain_names_whois_archive,domain_names_whois_filtered_reg_country,ngtlds_domain_names_dropped,ngtlds_domain_names_dropped_whois,ngtlds_domain_names_new,ngtlds_domain_names_whois,ngtlds_domain_names_whois_archive,ngtlds_domain_names_whois_filtered_reg_country,ngtlds_domain_names_whois_filtered_reg_country_archive
4 | custom2:domain_names_dropped,domain_names_dropped_whois,domain_names_new,domain_names_whois,domain_names_whois_archive,domain_names_whois_filtered_reg_country,domain_names_whois_filtered_reg_country_noproxy,domain_names_whois_filtered_reg_country_noproxy_archive,ngtlds_domain_names_dropped,ngtlds_domain_names_dropped_whois,ngtlds_domain_names_new,ngtlds_domain_names_whois,ngtlds_domain_names_whois_archive,ngtlds_domain_names_whois_filtered_reg_country,ngtlds_domain_names_whois_filtered_reg_country_archive,ngtlds_domain_names_whois_filtered_reg_country_noproxy,ngtlds_domain_names_whois_filtered_reg_country_noproxy_archive
5 | lite:domain_names_new,ngtlds_domain_names_new
6 |
--------------------------------------------------------------------------------
/whoisxmlapi_whoisdownload_bash/install_p12pack.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #A simple utility to convert pkcs12 files to certificates to be used
3 | #with the ssl-auth version of whoisdownload.bash
4 |
5 |
6 | if [[ $1 == "" || $1 == "--help" ]];then
7 |     cat >&2 <&2
28 |     exit 1
29 | fi
30 | if [ -z $2 ];then
31 |     echo "ERROR: Your password is needed" >&2
32 |     exit 1
33 | fi
34 |
35 |
36 | IN_PKCS="$1"
37 | IN_PW="$2"
38 |
39 | openssl pkcs12 -clcerts -nokeys -in "$IN_PKCS" -out client.crt -password pass:"$IN_PW" -passin pass:"$IN_PW"
40 | openssl pkcs12 -cacerts -nokeys -in "$IN_PKCS" -out whoisxmlapi.ca -password pass:"$IN_PW" -passin pass:"$IN_PW"
41 | openssl pkcs12 -nocerts -in "$IN_PKCS" -out private.key -password pass:"$IN_PW" -passin pass:"$IN_PW" -passout pass:"$IN_PW"
42 | openssl rsa -in private.key -out "client.key" -passin pass:"$IN_PW"
43 | rm private.key
44 | chmod 400 client.* whoisxmlapi.ca
45 |
46 | echo "All done. Now you can use the downloader script in this directory."
47 |
--------------------------------------------------------------------------------
/whoisxmlapi_bash_csv_to_mysqldb/load_csv_file_into_db_old.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | USAGE="USAGE:\n\t$0 db csv_file schema mode"
4 |
5 | db="$1"
6 | csv_file="$2"
7 | schema="$3"
8 | mode="$4"
9 |
10 | if [[ $# -ne 4 ]]
11 | then
12 |     echo -e "${USAGE}"
13 |     exit 1
14 | fi
15 |
16 | if [ -z "$db" ]; then
17 |     echo -e "db is missing\n${USAGE}"
18 |     exit 1
19 | fi
20 | if [ ! -f "$csv_file" ]; then
21 |     echo -e "csv_file $csv_file doesn't exist\n${USAGE}"
22 |     exit 1
23 | fi
24 | if [ ! -f "$schema" ]; then
25 |     echo -e "schema file $schema doesn't exist\n${USAGE}"
26 |     exit 1
27 | fi
28 | csv_file=`readlink -e $csv_file`
29 | schema=`readlink -e $schema`
30 | case ${mode} in
31 |     simple|regular )
32 |         table="whois_record_flat_${mode}"
33 |         ;;
34 |     full )
35 |         table="whois_record_flat"
36 |         ;;
37 |     * )
38 |         echo "mode must be specified(simple, regular, or full)"
39 |         exit 1
40 |         ;;
41 | esac
42 |
43 | if [[ -z $(mysql -A --skip-column-names ${db} <<< "SHOW TABLES LIKE \"${table}\";") ]]
44 | then
45 |     mysql ${db} --verbose <${schema}
46 | fi
47 |
48 | fields=$(head -n 1 ${csv_file}|sed 's/"//g')
49 | #nfields=$(echo ${fields}|awk -F\, '{print NF}')
50 | #ncolumns=$(mysql -A --skip-column-names ${db} <<< "SHOW COLUMNS FROM ${table};"|wc -l)
51 | #if [[ ${nfields} -ne ${ncolumns} ]]
52 | #then
53 | #    echo "Fatal: number of fields ${nfields} not equal to number of columns ${ncolumns} in table ${table}"
54 | #    exit 1
55 | #fi
56 |
57 | mysql ${db} --verbose -e "load data infile \"${csv_file}\" IGNORE into table $table
58 | fields terminated by ',' enclosed by '\"' LINES TERMINATED BY '\n' IGNORE 1 LINES
59 | (${fields})"
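A minimal invocation sketch for the csv loader above (database name and paths are hypothetical; since LOAD DATA INFILE runs on the server side, the MySQL server must be permitted to read the csv file, e.g. by its secure_file_priv setting):

    # Load one csv into table whois_record_flat_regular of database whoisdb,
    # creating the table from the schema file first if it does not exist yet.
    ./load_csv_file_into_db_old.sh whoisdb /data/csvs/1.csv loader_schema_regular.sql regular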
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/whois_utils/whois_user_interaction.py:
--------------------------------------------------------------------------------
1 | # User interaction module of Whois API LLC end user scripts
2 | #
3 | #Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
4 | #
5 |
6 | import sys
7 | import os
8 |
9 | import argparse
10 | from argparse import RawTextHelpFormatter
11 |
12 | import easygui as g
13 |
14 | import re
15 |
16 | #Logging functions
17 | def print_error_and_exit(message):
18 |     global DIALOG_COMMUNICATION
19 |     if DIALOG_COMMUNICATION:
20 |         _ = g.msgbox('Error. \n ' + message +'\nExiting.','WhoisXML API MySQL loader script')
21 |         exit(1)
22 |     else:
23 |         sys.stderr.write('\nError: ' + message+'\n')
24 |         sys.stderr.flush()
25 |         exit(1)
26 | def print_verbose(message):
27 |     global VERBOSE
28 |     global DEBUG
29 |     if VERBOSE or DEBUG:
30 |         sys.stderr.write(message + '\n')
31 |         sys.stderr.flush()
32 | def print_debug(message):
33 |     global DEBUG
34 |     if DEBUG:
35 |         sys.stderr.write(message + '\n')
36 |         sys.stderr.flush()
37 | #File and directory utilities
38 | def get_file(path, message):
39 |     """Given an arbitrary path, verifies that it points to a file.
40 |     If not, prints the error message and the path.
41 |     If yes, returns the normalized file path."""
42 |     thefile = os.path.normpath(path)
43 |     if not os.path.isfile(thefile):
44 |         print_error_and_exit(message +'\n (File specified: %s)' %(path))
45 |     else:
46 |         return(thefile)
47 |
48 | def get_directory(path, message):
49 |     """Given an arbitrary path, verifies that it points to a directory.
50 |     If not, prints the error message and the path.
51 |     If yes, returns the normalized directory path."""
52 |     thefile = os.path.normpath(path)
53 |     if not os.path.isdir(thefile):
54 |         print_error_and_exit(message +'\n (Directory specified: %s)' %(path))
55 |     else:
56 |         return(thefile)
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | These are scripts provided by WhoisXML API, Inc.,
2 | for clients using WHOIS data feeds to obtain bulk WHOIS data
3 | or to set up a WHOIS database.
4 |
5 | The contents of the subdirectories and files are:
6 |
7 | UsingScriptsOnWindows10.pdf : a brief blog post describing how to use
8 |                               all features of all our tools on
9 |                               Windows 10 systems.
10 |
11 | whoisxmlapi_download_whois_data: a Python script for downloading
12 |                               bulk data from daily and quarterly WHOIS
13 |                               data feeds in various formats.
14 |                               It can be used from command line,
15 |                               but also supports a simple GUI.
16 |                               For all platforms.
17 |
18 | whoisxmlapi_whoisdownload_bash: a bash script for downloading bulk
19 |                               data from daily and quarterly WHOIS
20 |                               data feeds.
21 |
22 | whoisxmlapi_csv2json: a Python3 script which converts WhoisXML API
23 |                               csv files to json files.
24 |
25 | whoisxmlapi_bash_csv_to_mysqldb: bash scripts to create and maintain
26 |                               WHOIS databases in MySQL
27 |                               based on csv files downloaded from
28 |                               WhoisXML API.
29 |                               If you do not insist on bash,
30 |                               check also
31 |                               whoisxmlapi_flexible_csv_to_mysqldb
32 |                               which is in Python 3
33 |                               and provides extended functionality.
34 |
35 | whoisxmlapi_flexible_csv_to_mysqldb:
36 |                               a flexible and portable script in Python
37 |                               to create and maintain
38 |                               WHOIS databases in MySQL
39 |                               based on csv files downloaded from
40 |                               WhoisXML API.
41 |
42 | whoisxmlapi_mysqldump_loaders: Python2 and bash scripts to set up a
43 |                               WHOIS database in MySQL,
44 |                               using the data obtained from
45 |                               WhoisXML API quarterly data feeds.
46 |
47 | whoisxmlapi_percona_loader_scripts: bash scripts for loading binary MySQL
48 |                               dumps of quarterly releases where available
49 |
50 | legacy_scripts: miscellaneous legacy scripts, no longer developed,
51 |                               published for compatibility reasons.
52 |
53 |
54 |
--------------------------------------------------------------------------------
/whoisxmlapi_percona_loader_scripts/legacy/restore_db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
3 | cd "$DIR" || exit 1
4 |
5 | DB=$1
6 | RESTORE_DIR=$2
7 | DB_DATA_DIR=${3:-/var/lib/mysql}
8 | SCHEMA_FILE=${4:-$DIR/whoiscrawler_mysql_schema.sql}
9 |
10 | TABLES="contact domain_names_whoisdatacollector registry_data whois_record whois_record_ids_whoisdatacollector"
11 |
12 | if [ -z "$DB" ]; then
13 |     echo "db is missing"
14 |     exit
15 | fi
16 |
17 | if [ ! -d "$RESTORE_DIR" ]; then
18 |     echo "restore_dir $RESTORE_DIR must be valid, we will copy data from this directory to your database data directory"
19 |     exit
20 | fi
21 |
22 | if [ ! -d "$DB_DATA_DIR" ]; then
23 |     echo "db_data_dir $DB_DATA_DIR must be valid, this should probably be /var/lib/mysql/ we will copy data from restore_dir to this directory/db_name"
24 |     exit
25 | fi
26 |
27 | if [ ! -f "$SCHEMA_FILE" ]; then
28 |     echo "schema_file $SCHEMA_FILE is missing"
29 |     exit
30 | fi
31 |
32 |
33 | echo "creating database $DB"
34 | mysql -e "create database $DB"
35 | mysql "$DB" < "$SCHEMA_FILE"
36 |
37 |
38 | if [ ! -d "$DB_DATA_DIR/$DB" ]; then
39 |     echo "$DB_DATA_DIR/$DB doesn't exist!"
40 |     exit
41 | fi
42 |
43 |
44 | echo "importing tablespaces"
45 | G_START_TIME=$(date +%s)
46 |
47 | for table in $TABLES; do
48 |     START_TIME=$(date +%s)
49 |     q="set FOREIGN_KEY_CHECKS=0;ALTER TABLE $DB.$table DISCARD TABLESPACE;"
50 |     echo "$q"
51 |     mysql -e "$q"
52 |
53 |     file="$table.ibd"
54 |     echo "copy table file $file from $RESTORE_DIR/$DB to $DB_DATA_DIR/$DB"
55 |     cp "$RESTORE_DIR/$DB/$file" "$DB_DATA_DIR/$DB/"
56 |
57 |     chown -R mysql:mysql "$DB_DATA_DIR/$DB"
58 |
59 |     q="ALTER TABLE $DB.$table IMPORT TABLESPACE"
60 |     echo "$q"
61 |     mysql -e "$q"
62 |
63 |     END_TIME=$(date +%s)
64 |     DUR=$((END_TIME-START_TIME))
65 |     echo "import table $table took $DUR seconds"
66 | done
67 |
68 | G_END_TIME=$(date +%s)
69 | GDUR=$((G_END_TIME-G_START_TIME))
70 | echo "import tables took $GDUR seconds"
71 |
72 |
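A minimal invocation sketch for the restore script above; the database name and restore directory are hypothetical, the script expects the .ibd files under $RESTORE_DIR/$DB, and it needs privileges to chown the copied files to mysql:mysql:

    # Discard, copy, and re-import the tablespaces of a quarterly binary dump.
    sudo ./restore_db.sh whoiscrawler_v20_com /data/percona_restore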
--------------------------------------------------------------------------------
/whoisxmlapi_bash_csv_to_mysqldb/loader_schema_simple.sql:
--------------------------------------------------------------------------------
1 | create table whois_record_flat_simple (
2 | `whois_record_flat_id` bigint(20) NOT NULL AUTO_INCREMENT,
3 | `domainName` varchar(256),
4 | `registrarName` varchar(512),
5 | `contactEmail` varchar(256),
6 | `whoisServer` varchar(512),
7 | `nameServers` varchar(256),
8 | `createdDate` varchar(256),
9 | `updatedDate` varchar(256),
10 | `expiresDate` varchar(256),
11 | `standardRegCreatedDate` varchar(256),
12 | `standardRegUpdatedDate` varchar(256),
13 | `standardRegExpiresDate` varchar(256),
14 | `status` text,
15 | `Audit_auditUpdatedDate` varchar(256),
16 | `registrant_email` varchar(256),
17 | `registrant_name` varchar(256),
18 | `registrant_organization` varchar(256),
19 | `registrant_street1` varchar(256),
20 | `registrant_street2` varchar(256),
21 | `registrant_street3` varchar(256),
22 | `registrant_street4` varchar(256),
23 | `registrant_city` varchar(64),
24 | `registrant_state` varchar(256),
25 | `registrant_postalCode` varchar(45),
26 | `registrant_country` varchar(45),
27 | `registrant_fax` varchar(45),
28 | `registrant_faxExt` varchar(45),
29 | `registrant_telephone` varchar(45),
30 | `registrant_telephoneExt` varchar(45),
31 | `administrativeContact_email` varchar(256),
32 | `administrativeContact_name` varchar(256),
33 | `administrativeContact_organization` varchar(256),
34 | `administrativeContact_street1` varchar(256),
35 | `administrativeContact_street2` varchar(256),
36 | `administrativeContact_street3` varchar(256),
37 | `administrativeContact_street4` varchar(256),
38 | `administrativeContact_city` varchar(64),
39 | `administrativeContact_state` varchar(256),
40 | `administrativeContact_postalCode` varchar(45),
41 | `administrativeContact_country` varchar(45),
42 | `administrativeContact_fax` varchar(45),
43 | `administrativeContact_faxExt` varchar(45),
44 | `administrativeContact_telephone` varchar(45),
45 | `administrativeContact_telephoneExt` varchar(45),
46 | `registrarIANAID` varchar(45),
47 | primary key (`whois_record_flat_id`)
48 | )ENGINE=InnoDB ROW_FORMAT=COMPRESSED AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
49 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/install_p12.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #This utility extracts a p12 pack obtained from WhoisXML API Inc.
3 | #into files that can be used with downloader scripts.
4 |
5 | import sys
6 | import os
7 | from OpenSSL import crypto as c
8 | try:
9 |     from Crypto.PublicKey import RSA
10 |     newcryptolib = False
11 | except ModuleNotFoundError:
12 |     newcryptolib = True
13 |     from Cryptodome.PublicKey import RSA
14 | import easygui as g
15 |
16 | windowtitle = 'WhoisXML API SSL pack converter'
17 | infile = g.fileopenbox('Choose the pack.p12 file obtained from WhoisXML API Inc.',
18 |                        windowtitle)
19 | password = g.passwordbox('Enter the password supplied with your pack',
20 |                          windowtitle)
21 |
22 | if newcryptolib:
23 |     password = bytes(password, encoding='utf-8')
24 | try:
25 |     p12 = c.load_pkcs12(open(infile, 'rb').read(), password)
26 | except:
27 |     g.msgbox('Error: invalid pack or password. Exiting.')
28 |     exit(6)
29 |
30 | try:
31 |     cert = c.dump_certificate(c.FILETYPE_PEM, p12.get_certificate())
32 |     certfile = open('client.crt','wb')
33 |     certfile.write(cert)
34 |     certfile.close()
35 |
36 |     key = c.dump_privatekey(c.FILETYPE_PEM, p12.get_privatekey())
37 |     rsakey = RSA.importKey(key)
38 |     keyfile = open('client.key','wb')
39 |     keyfile.write(rsakey.exportKey())
40 |     keyfile.close()
41 |     os.chmod('client.key', 0o400)  # octal 0o400: owner read-only
42 |
43 |     cacert = c.dump_certificate(c.FILETYPE_PEM, p12.get_ca_certificates()[0])
44 |     cacertfile = open('whoisxmlapi.ca','wb')
45 |     cacertfile.write(cacert)
46 |     cacertfile.close()
47 | except:
48 |     g.msgbox('Error: could not overwrite one of the files.\nEnsure that the following files do not exist or can be overwritten:\n whoisxmlapi.ca\n client.crt\n client.key\n')
49 |     exit(1)
50 |
51 | g.msgbox('The files needed for authentication:\n whoisxmlapi.ca\n client.crt\n client.key\n have been created.\nNow you can use ssl authentication.\n\nIMPORTANT: keep client.key secret!', windowtitle)
52 |
--------------------------------------------------------------------------------
/website_contactscats_to_mysqldb/website_contacts_categories.ddl:
--------------------------------------------------------------------------------
1 | /*
2 | Sample schema file for the Website Contacts and Categories MySQL database
3 | v 0.0
4 | (c) WhoisXML API, Inc.
5 | */ 6 | 7 | CREATE TABLE category( 8 | category VARCHAR(255) PRIMARY KEY 9 | ); 10 | 11 | CREATE TABLE domain( 12 | domainID INTEGER PRIMARY KEY AUTO_INCREMENT, 13 | domainName VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 14 | countryCode VARCHAR(2), 15 | meta_title LONGBLOB, 16 | meta_description LONGBLOB, 17 | socialLinks_facebook TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 18 | socialLinks_googlePlus TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 19 | socialLinks_instagram TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 20 | socialLinks_twitter TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 21 | socialLinks_linkedIn TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci 22 | ); 23 | 24 | CREATE TABLE domain_category( 25 | categoryID VARCHAR(255), 26 | domainID INTEGER, 27 | PRIMARY KEY (categoryID, domainID) 28 | ); 29 | 30 | CREATE TABLE email( 31 | emailID INTEGER PRIMARY KEY AUTO_INCREMENT, 32 | domainID INTEGER, 33 | description LONGBLOB, 34 | email TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 35 | CONSTRAINT FK_email_domain FOREIGN KEY(domainID) REFERENCES domain(domainID) 36 | ); 37 | 38 | CREATE TABLE phone( 39 | phoneID INTEGER PRIMARY KEY AUTO_INCREMENT, 40 | domainID INTEGER, 41 | description LONGBLOB, 42 | phoneNumber TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 43 | callHours LONGBLOB, 44 | CONSTRAINT FK_phone_domain FOREIGN KEY(domainID) REFERENCES domain(domainID) 45 | ); 46 | 47 | CREATE TABLE postalAddress( 48 | postalAddressID INTEGER PRIMARY KEY AUTO_INCREMENT, 49 | domainID INTEGER, 50 | postalAddress LONGBLOB, 51 | CONSTRAINT FK_postalAddress_domain FOREIGN KEY(domainID) REFERENCES domain(domainID) 52 | ); 53 | 54 | CREATE TABLE companyName( 55 | companyNameID INTEGER PRIMARY KEY AUTO_INCREMENT, 56 | domainID INTEGER, 57 | companyName LONGBLOB, 58 | CONSTRAINT FK_company_domain FOREIGN KEY(domainID) REFERENCES domain(domainID) 59 | ); 60 | -------------------------------------------------------------------------------- /whoisxmlapi_mysqldump_loaders/legacy/load_mysql_data_per_tables.sh: -------------------------------------------------------------------------------- 1 | schema_file="$1" 2 | if [ ! -f "$schema_file" ]; then 3 | echo "invalid schema file $schema_file"; 4 | exit 5 | fi 6 | table_files_dir="$2" 7 | if [ ! 
-d "$table_files_dir" ]; then
8 |     echo "please specify a valid directory where all table files reside in"
9 |     exit
10 | fi
11 | db="$3"
12 | if [ -z "$db" ]; then
13 |     echo "db is missing"
14 |     exit
15 | fi
16 | db_username="$4"
17 | if [ -z "$db_username" ]; then
18 |     echo "db username is missing"
19 |     exit
20 | fi
21 | db_password="$5"
22 | if [ -z "$db_password" ]; then
23 |     echo "db_password is missing"
24 |     exit
25 | fi
26 | ./load_mysql_schema.sh $schema_file $db $db_username $db_password
27 |
28 | tables="whois_record registry_data contact domain_names_whoisdatacollector"
29 |
30 | mysql -u$db_username -p$db_password $db -e "alter table whois_record drop index domain_name_index;alter table whois_record drop index domain_name;"
31 | mysql -u$db_username -p$db_password $db -e "alter table registry_data drop index domain_name_index;alter table registry_data drop index domain_name;"
32 |
33 | table_files_dir=$table_files_dir/*
34 |
35 | for file in $table_files_dir; do
36 |
37 |     time=`date +%s`
38 |     if [ -f "$file" ]; then
39 |         time=`date +%s`
40 |
41 |         echo "loading data from file $file"
42 |         if [ ${file: -3} == ".gz" ]; then
43 |             { echo "SET autocommit = 0;"
44 |               zcat "$file"
45 |               echo "commit;" ; } | mysql -u$db_username -p$db_password --force $db
46 |         elif [ ${file: -4} == ".sql" ]; then
47 |             { echo "SET autocommit = 0;"
48 |               cat "$file"
49 |               echo "commit;" ; } | mysql -u$db_username -p$db_password --force $db
50 |         fi
51 |
52 |     fi
53 |
54 |     time2=`date +%s`
55 |     dur=`expr $time2 - $time`
56 |     echo " loading $table from file $file took $dur seconds"
57 |
58 | done
59 | time=`date +%s`
60 | mysql --force -u$db_username -p$db_password $db -e "alter table whois_record add index domain_name_index(domain_name)"
61 | mysql --force -u$db_username -p$db_password $db -e "alter table registry_data add index domain_name_index(domain_name)"
62 | time2=`date +%s`
63 | dur=`expr $time2 - $time`
64 | echo " add indices took $dur seconds"
65 |
66 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/README_python2.txt:
--------------------------------------------------------------------------------
1 | Supplement for the documentation of the WhoisXML API
2 |
3 | Whois data download utility
4 |
5 | download_whois_data.py
6 |
7 | Release version 1.0.0 dated 2019-12-10.
8 |
9 | Copyright (c) 2010-2021 Whois API, Inc. http://www.whoisxmlapi.com
10 | -------------------------------------------------------------------
11 |
12 | The present file supplements README.txt as a replacement for steps 1
13 | and 2 in Section 2, Installation, intended for legacy Python 2 users
14 | as opposed to the Python 3-based description of the main README. It is
15 | recommended to switch to Python 3, as the support of Python 2 ends on
16 | 2020-01-01.
17 |
18 | Step 1: Install Python
19 |
20 | The script has been tested with Python 2.7.15. If for some reason you
21 | have an earlier main version of Python 2, such as Python 2.6, you
22 | will have compatibility issues. (This is the case when you use the
23 | default Python on certain releases of CentOS or RHEL ver. 6.) It is
24 | always possible on those systems to set up Python 2.7.x in parallel;
25 | consult the documentation of your system.
26 |
27 | -On Linux systems, use your package manager, e.g. "apt-get install python".
28 |
29 | -On Windows systems, download the installer from
30 | www.python.org, series 2 (2.7.x) for your platform, then start and
31 | go through the installation procedure. Be careful to install with
32 | the following options enabled:
33 |     -"Install pip" (this is the default)
34 |     -"Add Python to path"
35 |
36 | Step 2: Install Dependencies
37 |
38 | Additional required python packages are:
39 |
40 |     argparse
41 |     easygui
42 |     requests
43 |
44 | On both Windows and Linux you can install them by the
45 | (root/administrator) command-line:
46 |
47 |     pip install <package>
48 |
49 | where <package> is one of the above three packages. Alternatively,
50 | you may find these as software packages for your system (e.g. "apt-get
51 | install python-easygui").
52 |
53 | Once these steps are done, the script's dependencies are met and it is
54 | ready for use.
55 |
56 | The script supports the access of the data via ssl-encrypted pages
57 | using ssl key-based authentication. Those clients who want to use this
58 | possibility should read the file
59 |
60 | README.SSL.txt
61 |
62 | to do the necessary steps for configuring this kind of access.
63 |
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/CHANGELOG.txt:
--------------------------------------------------------------------------------
1 | Changelog for the WhoisXML API
2 |
3 | Whois data download utility
4 |
5 | download_whois_data.py
6 |
7 | 2023-06-07
8 | ----------
9 |
10 | - Fixed a bug affecting newer versions of urllib3.
11 |   (Modification tested on newer Linux environments and python3 only.)
12 | - Notes made on obsoleting sslauth in docs.
13 |
14 | 2022-03-30
15 | ----------
16 |
17 | - Added the "domains", "verified_domains", "missing_domains", and
18 |   "reserved_domains" data formats (domain lists) to the quarterly
19 |   database data feeds.
20 |
21 | 2021-08-18
22 | ----------
23 |
24 | - Added the option --only-changed to use added/dropped tlds files
25 |   instead of supported_tlds. Works only for feeds where these files
26 |   are available.
27 | - Changed the default value of --maxtries to 3 upon users' request
28 |
29 | 2021-07-14
30 | ----------
31 | - The broken --maxtries option has been fixed, the default value has
32 |   been changed from 5 to 1 for performance reasons
33 | - Fixed a bug in the removal of empty supported tld
34 |
35 | 2021-06-01
36 | ----------
37 | - A minor bug fixed in interactive mode.
38 |
39 | 2021-04-19
40 | ----------
41 | - Added support for new-generation access: subscription plan-dependent
42 |   URLs, at the moment typically at newly-registered-domains.whoisxmlapi.com
43 |
44 | 2021-03-19
45 | ----------
46 | - Fixed an issue in the GUI mode affecting easygui >= 0.98.2
47 | - Added the temporary workaround of sslauth to GUI mode
48 |
49 | 2021-03-10
50 | ----------
51 | - Recovered the temporary option for sslauth, corrected http to https in urls.
52 |
53 | 2021-02-22
54 | ----------
55 | - Fixed a bug in downloading supported tlds lists, affecting --all-tlds option
56 |
57 | 2020-05-13
58 | ----------
59 | - Introduced a mechanism to detect premature daily downloads.
60 |
61 | 2019-12-10
62 | ----------
63 |
64 | - The script is made Python 3 compatible, the documentation has been
65 |   amended accordingly.
66 |
67 | 2019-06-20
68 | ----------
69 | - The option --list-feeds shows brief feed descriptions.
70 | - The --describe-feed option has been introduced.
71 | - A bug has been fixed which affected tld-independent feeds.
72 | - An error message is generated if feeds.ini cannot be read.
73 |
74 | 2018-11-27
75 | ----------
76 | -Added support for the feed whois_record_delta_domain_names_change_archive
77 | -Fixed a bug of not reporting unavailable files for whole directory downloads
78 |
79 | 2018-11-12:
80 | ----------
81 | -CHANGELOG.txt introduced
82 | -Added support for the data feed "gtlds_domain_names_whois_archive"
83 | -Fixed issues related to mixing tld dependent and independent data
84 |  formats within the same session
85 | -Introduced the support for determining list of supported tlds
86 |  for archive feeds with year-named subdirectories
87 | -Refined error message for non-existing resources
88 |
--------------------------------------------------------------------------------
/whoisxmlapi_whoisdownload_bash/README.SSL:
--------------------------------------------------------------------------------
1 | Setting up SSL authentication to use with whoisdownload.sh
2 |
3 | install_p12pack.sh
4 |
5 | Provided by WhoisXML API, Inc.
6 |
7 | dated: 2018-02-01
8 |
9 | CONTENTS:
10 | ---------
11 |
12 | 1. Brief summary
13 |
14 | 2. Installing the auth credentials
15 |
16 | 3. Using the script
17 |
18 |
19 | 1. Brief summary
20 | ----------------
21 |
22 | The goal is to set up your authentication credentials so that you can
23 | use whoisdownload.sh with the option --auth-type=ssl instead of
24 | password authentication.
25 |
26 | 2. Installing the auth credentials
27 | ----------------------------------
28 |
29 | As a starting point we assume that you have obtained a p12 pack with
30 | certificates and keys, and a password for this file from WhoisXML API,
31 | Inc. (If this is not the case, you will not be able to use this script.)
32 |
33 | Let us call the obtained file pack.p12, and the password in the
34 | examples will be "YourPassword".
35 |
36 | You need to do the following just once:
37 |
38 | -Put pack.p12 into the directory where the download scripts (and this
39 | file) reside.
40 |
41 | -Make sure you have openssl installed. If you do not have it, install
42 | it (e.g. "apt-get install openssl" on Debian-flavor systems: Ubuntu,
43 | Mint, Ubuntu on Windows).
44 |
45 | -Run the following command in the directory:
46 |
47 | ./install_p12pack.sh pack.p12 YourPassword
48 |
49 | which gives the following output:
50 |
51 | MAC verified OK
52 | MAC verified OK
53 | MAC verified OK
54 | writing RSA key
55 | All done. Now you can use the downloader script in this directory.
56 |
57 | Also, it will generate the files client.crt, client.key and whoisxmlapi.ca
58 | needed for the authentication. These will be only readable by you.
59 |
60 | IMPORTANT: keep the generated client.key confidential.
61 |
62 | Once this is done, you will be able to use the script.
63 |
64 | 3. Using the script
65 | -------------------
66 |
67 | To use ssl authentication instead of password authentication, add
68 |
69 | --auth-type=ssl
70 |
71 | to the options of the script when you use it. You will not need the
72 | --user and --password options then.
73 |
74 | The default location of the files client.crt, client.key and
75 | whoisxmlapi.ca is the same directory as your script.
76 |
77 | You may move these files elsewhere, and specify their location with
78 | the respective options, e.g.
79 |
80 | --cacert=/home/myuser/mycustomdir/whoisxmlapi.ca
81 | --sslcert=/home/myuser/mycustomdir/client.crt
82 | --sslkey=/home/myuser/mycustomdir/client.key
83 |
84 | You may also rename these files and specify them with their new names
85 | using the above options.
As an alternative to the command-line 86 | options, you may specify the location of the respective files by 87 | setting the variables 88 | AUTHTYPE="ssl" 89 | and the variables 90 | CACERTFILE 91 | CERTFILE 92 | KEYFILE 93 | to the location of the auth files in your 94 | ~/.whoisdownload.sh 95 | file if you prefer this approach. 96 | IMPORTANT: the string providing file locations as well as those 97 | possibly given as values to variables specifying full path should be 98 | preferably full absolute paths and they should resolve properly by the 99 | "realpath" command on your system. Hence, for instance, do not use "~" 100 | to refer to your home directory in these path strings. 101 | -------------------------------------------------------------------------------- /whoisxmlapi_csv2json/transform_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from argparse import ArgumentParser 4 | import csv 5 | import multiprocessing 6 | import json 7 | import sys 8 | import os 9 | from platform import system 10 | 11 | VERSION = "0.0.2" 12 | MYNAME = sys.argv[0].replace('./', '') 13 | 14 | # Preparing arguments 15 | argparser = ArgumentParser(description='Convert CSV to JSON format') 16 | argparser.add_argument('--version', 17 | help='Print version information and exit.', 18 | action='version', 19 | version=MYNAME + ' ver. ' + VERSION + '\n(c) WhoisXML API LLC.') 20 | argparser.add_argument('-i', '--path', 21 | help='input directory with uncompressed CSVs or single CSV file', 22 | type=str, required=True) 23 | argparser.add_argument('--key', help='primary key field for records, default "domainName"', 24 | type=str, default='domainName') 25 | argparser.add_argument('--threads', 26 | help='number of threads, default 1', 27 | type=int, default=1) 28 | argparser.add_argument('--force', help='overwrite existent files', action='store_true') 29 | argparser.add_argument('--human-readable', 30 | help='generate human readable output', 31 | action='store_true') 32 | args = argparser.parse_args() 33 | 34 | # increase max size of the field 35 | if system() == 'Windows': 36 | csv.field_size_limit(2147483647) 37 | else: 38 | from sys import maxsize as csv_maxsize 39 | csv.field_size_limit(csv_maxsize) 40 | 41 | 42 | def convert_json(csv_queue): 43 | while not csv_queue.empty(): 44 | csv_file = csv_queue.get() 45 | json_file = os.path.join( 46 | os.path.dirname(csv_file), 47 | os.path.basename(csv_file).replace('.csv', '.json')) 48 | if args.force or not os.path.isfile(json_file): 49 | out_data = dict() 50 | with open(csv_file, 'rt') as infile: 51 | infile_csv = csv.DictReader(infile) 52 | for in_row in infile_csv: 53 | out_data.update({in_row[args.key]: {}}) 54 | for field in infile_csv.fieldnames: 55 | if field != args.key and in_row[field] != '': 56 | out_data[in_row[args.key]].update({field: in_row[field]}) 57 | with open(json_file, 'wt') as json_file_obj: 58 | if args.human_readable: 59 | json_file_obj.write(json.dumps(out_data, indent=4)) 60 | else: 61 | json_file_obj.write(json.dumps(out_data)) 62 | del out_data 63 | 64 | 65 | if __name__ == '__main__': 66 | # populating queue 67 | csvQueue = multiprocessing.Queue() 68 | 69 | if os.path.isdir(args.path): 70 | for csv_f in os.listdir(args.path): 71 | if csv_f.endswith('.csv'): 72 | csvQueue.put(os.path.join(args.path, csv_f)) 73 | elif os.path.isfile(args.path) and args.path.endswith('.csv'): 74 | csvQueue.put(args.path) 75 | else: 76 | exit(1) 77 | 78 | threads = [] 79 | for t in range(0, 
args.threads): 80 | convert_thread = multiprocessing.Process(target=convert_json, args=(csvQueue, )) 81 | convert_thread.start() 82 | threads.append(convert_thread) 83 | for convert_thread in threads: 84 | convert_thread.join() 85 | -------------------------------------------------------------------------------- /whoisxmlapi_flexible_csv_to_mysqldb/field_types.csv: -------------------------------------------------------------------------------- 1 | domainName,varchar(256) 2 | registrarName,varchar(512) 3 | contactEmail,varchar(256) 4 | whoisServer,varchar(512) 5 | nameServers,varchar(256) 6 | createdDate,varchar(256) 7 | updatedDate,varchar(256) 8 | expiresDate,varchar(256) 9 | standardRegCreatedDate,varchar(256) 10 | standardRegUpdatedDate,varchar(256) 11 | standardRegExpiresDate,varchar(256) 12 | status,text 13 | RegistryData_rawText,longtext 14 | WhoisRecord_rawText,longtext 15 | Audit_auditUpdatedDate,varchar(256) 16 | registrant_rawText,longtext 17 | registrant_email,varchar(256) 18 | registrant_name,varchar(256) 19 | registrant_organization,varchar(256) 20 | registrant_street1,varchar(256) 21 | registrant_street2,varchar(256) 22 | registrant_street3,varchar(256) 23 | registrant_street4,varchar(256) 24 | registrant_city,varchar(64) 25 | registrant_state,varchar(256) 26 | registrant_postalCode,varchar(45) 27 | registrant_country,varchar(45) 28 | registrant_fax,varchar(45) 29 | registrant_faxExt,varchar(45) 30 | registrant_telephone,varchar(45) 31 | registrant_telephoneExt,varchar(45) 32 | administrativeContact_rawText,longtext 33 | administrativeContact_email,varchar(256) 34 | administrativeContact_name,varchar(256) 35 | administrativeContact_organization,varchar(256) 36 | administrativeContact_street1,varchar(256) 37 | administrativeContact_street2,varchar(256) 38 | administrativeContact_street3,varchar(256) 39 | administrativeContact_street4,varchar(256) 40 | administrativeContact_city,varchar(64) 41 | administrativeContact_state,varchar(256) 42 | administrativeContact_postalCode,varchar(45) 43 | administrativeContact_country,varchar(45) 44 | administrativeContact_fax,varchar(45) 45 | administrativeContact_faxExt,varchar(45) 46 | administrativeContact_telephone,varchar(45) 47 | administrativeContact_telephoneExt,varchar(45) 48 | billingContact_rawText,longtext 49 | billingContact_email,varchar(256) 50 | billingContact_name,varchar(256) 51 | billingContact_organization,varchar(256) 52 | billingContact_street1,varchar(256) 53 | billingContact_street2,varchar(256) 54 | billingContact_street3,varchar(256) 55 | billingContact_street4,varchar(256) 56 | billingContact_city,varchar(64) 57 | billingContact_state,varchar(256) 58 | billingContact_postalCode,varchar(45) 59 | billingContact_country,varchar(45) 60 | billingContact_fax,varchar(45) 61 | billingContact_faxExt,varchar(45) 62 | billingContact_telephone,varchar(45) 63 | billingContact_telephoneExt,varchar(45) 64 | technicalContact_rawText,longtext 65 | technicalContact_email,varchar(256) 66 | technicalContact_name,varchar(256) 67 | technicalContact_organization,varchar(256) 68 | technicalContact_street1,varchar(256) 69 | technicalContact_street2,varchar(256) 70 | technicalContact_street3,varchar(256) 71 | technicalContact_street4,varchar(256) 72 | technicalContact_city,varchar(64) 73 | technicalContact_state,varchar(256) 74 | technicalContact_postalCode,varchar(45) 75 | technicalContact_country,varchar(45) 76 | technicalContact_fax,varchar(45) 77 | technicalContact_faxExt,varchar(45) 78 | technicalContact_telephone,varchar(45) 79 | 
technicalContact_telephoneExt,varchar(45) 80 | zoneContact_rawText,longtext 81 | zoneContact_email,varchar(256) 82 | zoneContact_name,varchar(256) 83 | zoneContact_organization,varchar(256) 84 | zoneContact_street1,varchar(256) 85 | zoneContact_street2,varchar(256) 86 | zoneContact_street3,varchar(256) 87 | zoneContact_street4,varchar(256) 88 | zoneContact_city,varchar(64) 89 | zoneContact_state,varchar(256) 90 | zoneContact_postalCode,varchar(45) 91 | zoneContact_country,varchar(45) 92 | zoneContact_fax,varchar(45) 93 | zoneContact_faxExt,varchar(45) 94 | zoneContact_telephone,varchar(45) 95 | zoneContact_telephoneExt,varchar(45) 96 | registrarIANAID,varchar(45) 97 | -------------------------------------------------------------------------------- /whoisxmlapi_bash_csv_to_mysqldb/loader_schema_regular_daily_only.sql: -------------------------------------------------------------------------------- 1 | create table whois_record_flat_regular ( 2 | `whois_record_flat_id` bigint(20) NOT NULL AUTO_INCREMENT, 3 | `domainName` varchar(256), 4 | `registrarName` varchar(512), 5 | `contactEmail` varchar(256), 6 | `whoisServer` varchar(512), 7 | `nameServers` varchar(256), 8 | `createdDate` varchar(256), 9 | `updatedDate` varchar(256), 10 | `expiresDate` varchar(256), 11 | `standardRegCreatedDate` varchar(256), 12 | `standardRegUpdatedDate` varchar(256), 13 | `standardRegExpiresDate` varchar(256), 14 | `status` text, 15 | `Audit_auditUpdatedDate` varchar(256), 16 | `registrant_email` varchar(256), 17 | `registrant_name` varchar(256), 18 | `registrant_organization` varchar(256), 19 | `registrant_street1` varchar(256), 20 | `registrant_street2` varchar(256), 21 | `registrant_street3` varchar(256), 22 | `registrant_street4` varchar(256), 23 | `registrant_city` varchar(64), 24 | `registrant_state` varchar(256), 25 | `registrant_postalCode` varchar(45), 26 | `registrant_country` varchar(45), 27 | `registrant_fax` varchar(45), 28 | `registrant_faxExt` varchar(45), 29 | `registrant_telephone` varchar(45), 30 | `registrant_telephoneExt` varchar(45), 31 | `administrativeContact_email` varchar(256), 32 | `administrativeContact_name` varchar(256), 33 | `administrativeContact_organization` varchar(256), 34 | `administrativeContact_street1` varchar(256), 35 | `administrativeContact_street2` varchar(256), 36 | `administrativeContact_street3` varchar(256), 37 | `administrativeContact_street4` varchar(256), 38 | `administrativeContact_city` varchar(64), 39 | `administrativeContact_state` varchar(256), 40 | `administrativeContact_postalCode` varchar(45), 41 | `administrativeContact_country` varchar(45), 42 | `administrativeContact_fax` varchar(45), 43 | `administrativeContact_faxExt` varchar(45), 44 | `administrativeContact_telephone` varchar(45), 45 | `administrativeContact_telephoneExt` varchar(45), 46 | `billingContact_email` varchar(256), 47 | `billingContact_name` varchar(256), 48 | `billingContact_organization` varchar(256), 49 | `billingContact_street1` varchar(256), 50 | `billingContact_street2` varchar(256), 51 | `billingContact_street3` varchar(256), 52 | `billingContact_street4` varchar(256), 53 | `billingContact_city` varchar(64), 54 | `billingContact_state` varchar(256), 55 | `billingContact_postalCode` varchar(45), 56 | `billingContact_country` varchar(45), 57 | `billingContact_fax` varchar(45), 58 | `billingContact_faxExt` varchar(45), 59 | `billingContact_telephone` varchar(45), 60 | `billingContact_telephoneExt` varchar(45), 61 | `technicalContact_email` varchar(256), 62 | `technicalContact_name` 
varchar(256), 63 | `technicalContact_organization` varchar(256), 64 | `technicalContact_street1` varchar(256), 65 | `technicalContact_street2` varchar(256), 66 | `technicalContact_street3` varchar(256), 67 | `technicalContact_street4` varchar(256), 68 | `technicalContact_city` varchar(64), 69 | `technicalContact_state` varchar(256), 70 | `technicalContact_postalCode` varchar(45), 71 | `technicalContact_country` varchar(45), 72 | `technicalContact_fax` varchar(45), 73 | `technicalContact_faxExt` varchar(45), 74 | `technicalContact_telephone` varchar(45), 75 | `technicalContact_telephoneExt` varchar(45), 76 | `zoneContact_email` varchar(256), 77 | `zoneContact_name` varchar(256), 78 | `zoneContact_organization` varchar(256), 79 | `zoneContact_street1` varchar(256), 80 | `zoneContact_street2` varchar(256), 81 | `zoneContact_street3` varchar(256), 82 | `zoneContact_street4` varchar(256), 83 | `zoneContact_city` varchar(64), 84 | `zoneContact_state` varchar(256), 85 | `zoneContact_postalCode` varchar(45), 86 | `zoneContact_country` varchar(45), 87 | `zoneContact_fax` varchar(45), 88 | `zoneContact_faxExt` varchar(45), 89 | `zoneContact_telephone` varchar(45), 90 | `zoneContact_telephoneExt` varchar(45), 91 | `registrarIANAID` varchar(45), 92 | primary key (`whois_record_flat_id`) 93 | )ENGINE=InnoDB ROW_FORMAT=COMPRESSED AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 94 | -------------------------------------------------------------------------------- /whoisxmlapi_bash_csv_to_mysqldb/loader_schema_regular.sql: -------------------------------------------------------------------------------- 1 | create table whois_record_flat_regular ( 2 | `whois_record_flat_id` bigint(20) NOT NULL AUTO_INCREMENT, 3 | `domainName` varchar(256), 4 | `registrarName` varchar(512), 5 | `contactEmail` varchar(256), 6 | `whoisServer` varchar(512), 7 | `nameServers` varchar(256), 8 | `createdDate` varchar(256), 9 | `updatedDate` varchar(256), 10 | `expiresDate` varchar(256), 11 | `standardRegCreatedDate` varchar(256), 12 | `standardRegUpdatedDate` varchar(256), 13 | `standardRegExpiresDate` varchar(256), 14 | `status` text, 15 | `Audit_auditUpdatedDate` varchar(256), 16 | `registrant_rawText` longtext, 17 | `registrant_email` varchar(256), 18 | `registrant_name` varchar(256), 19 | `registrant_organization` varchar(256), 20 | `registrant_street1` varchar(256), 21 | `registrant_street2` varchar(256), 22 | `registrant_street3` varchar(256), 23 | `registrant_street4` varchar(256), 24 | `registrant_city` varchar(64), 25 | `registrant_state` varchar(256), 26 | `registrant_postalCode` varchar(45), 27 | `registrant_country` varchar(45), 28 | `registrant_fax` varchar(45), 29 | `registrant_faxExt` varchar(45), 30 | `registrant_telephone` varchar(45), 31 | `registrant_telephoneExt` varchar(45), 32 | `administrativeContact_rawText` longtext, 33 | `administrativeContact_email` varchar(256), 34 | `administrativeContact_name` varchar(256), 35 | `administrativeContact_organization` varchar(256), 36 | `administrativeContact_street1` varchar(256), 37 | `administrativeContact_street2` varchar(256), 38 | `administrativeContact_street3` varchar(256), 39 | `administrativeContact_street4` varchar(256), 40 | `administrativeContact_city` varchar(64), 41 | `administrativeContact_state` varchar(256), 42 | `administrativeContact_postalCode` varchar(45), 43 | `administrativeContact_country` varchar(45), 44 | `administrativeContact_fax` varchar(45), 45 | `administrativeContact_faxExt` varchar(45), 46 | `administrativeContact_telephone` varchar(45), 47 | 
`administrativeContact_telephoneExt` varchar(45), 48 | `billingContact_rawText` longtext, 49 | `billingContact_email` varchar(256), 50 | `billingContact_name` varchar(256), 51 | `billingContact_organization` varchar(256), 52 | `billingContact_street1` varchar(256), 53 | `billingContact_street2` varchar(256), 54 | `billingContact_street3` varchar(256), 55 | `billingContact_street4` varchar(256), 56 | `billingContact_city` varchar(64), 57 | `billingContact_state` varchar(256), 58 | `billingContact_postalCode` varchar(45), 59 | `billingContact_country` varchar(45), 60 | `billingContact_fax` varchar(45), 61 | `billingContact_faxExt` varchar(45), 62 | `billingContact_telephone` varchar(45), 63 | `billingContact_telephoneExt` varchar(45), 64 | `technicalContact_rawText` longtext, 65 | `technicalContact_email` varchar(256), 66 | `technicalContact_name` varchar(256), 67 | `technicalContact_organization` varchar(256), 68 | `technicalContact_street1` varchar(256), 69 | `technicalContact_street2` varchar(256), 70 | `technicalContact_street3` varchar(256), 71 | `technicalContact_street4` varchar(256), 72 | `technicalContact_city` varchar(64), 73 | `technicalContact_state` varchar(256), 74 | `technicalContact_postalCode` varchar(45), 75 | `technicalContact_country` varchar(45), 76 | `technicalContact_fax` varchar(45), 77 | `technicalContact_faxExt` varchar(45), 78 | `technicalContact_telephone` varchar(45), 79 | `technicalContact_telephoneExt` varchar(45), 80 | `zoneContact_rawText` longtext, 81 | `zoneContact_email` varchar(256), 82 | `zoneContact_name` varchar(256), 83 | `zoneContact_organization` varchar(256), 84 | `zoneContact_street1` varchar(256), 85 | `zoneContact_street2` varchar(256), 86 | `zoneContact_street3` varchar(256), 87 | `zoneContact_street4` varchar(256), 88 | `zoneContact_city` varchar(64), 89 | `zoneContact_state` varchar(256), 90 | `zoneContact_postalCode` varchar(45), 91 | `zoneContact_country` varchar(45), 92 | `zoneContact_fax` varchar(45), 93 | `zoneContact_faxExt` varchar(45), 94 | `zoneContact_telephone` varchar(45), 95 | `zoneContact_telephoneExt` varchar(45), 96 | `registrarIANAID` varchar(45), 97 | primary key (`whois_record_flat_id`) 98 | )ENGINE=InnoDB ROW_FORMAT=COMPRESSED AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 99 | -------------------------------------------------------------------------------- /whoisxmlapi_bash_csv_to_mysqldb/loader_schema_full.sql: -------------------------------------------------------------------------------- 1 | create table whois_record_flat ( 2 | `whois_record_flat_id` bigint(20) NOT NULL AUTO_INCREMENT, 3 | `domainName` varchar(256), 4 | `registrarName` varchar(512), 5 | `contactEmail` varchar(256), 6 | `whoisServer` varchar(512), 7 | `nameServers` varchar(256), 8 | `createdDate` varchar(256), 9 | `updatedDate` varchar(256), 10 | `expiresDate` varchar(256), 11 | `standardRegCreatedDate` varchar(256), 12 | `standardRegUpdatedDate` varchar(256), 13 | `standardRegExpiresDate` varchar(256), 14 | `status` text, 15 | `RegistryData_rawText` longtext, 16 | `WhoisRecord_rawText` longtext, 17 | `Audit_auditUpdatedDate` varchar(256), 18 | `registrant_rawText` longtext, 19 | `registrant_email` varchar(256), 20 | `registrant_name` varchar(256), 21 | `registrant_organization` varchar(256), 22 | `registrant_street1` varchar(256), 23 | `registrant_street2` varchar(256), 24 | `registrant_street3` varchar(256), 25 | `registrant_street4` varchar(256), 26 | `registrant_city` varchar(64), 27 | `registrant_state` varchar(256), 28 | `registrant_postalCode` 
varchar(45), 29 | `registrant_country` varchar(45), 30 | `registrant_fax` varchar(45), 31 | `registrant_faxExt` varchar(45), 32 | `registrant_telephone` varchar(45), 33 | `registrant_telephoneExt` varchar(45), 34 | `administrativeContact_rawText` longtext, 35 | `administrativeContact_email` varchar(256), 36 | `administrativeContact_name` varchar(256), 37 | `administrativeContact_organization` varchar(256), 38 | `administrativeContact_street1` varchar(256), 39 | `administrativeContact_street2` varchar(256), 40 | `administrativeContact_street3` varchar(256), 41 | `administrativeContact_street4` varchar(256), 42 | `administrativeContact_city` varchar(64), 43 | `administrativeContact_state` varchar(256), 44 | `administrativeContact_postalCode` varchar(45), 45 | `administrativeContact_country` varchar(45), 46 | `administrativeContact_fax` varchar(45), 47 | `administrativeContact_faxExt` varchar(45), 48 | `administrativeContact_telephone` varchar(45), 49 | `administrativeContact_telephoneExt` varchar(45), 50 | `billingContact_rawText` longtext, 51 | `billingContact_email` varchar(256), 52 | `billingContact_name` varchar(256), 53 | `billingContact_organization` varchar(256), 54 | `billingContact_street1` varchar(256), 55 | `billingContact_street2` varchar(256), 56 | `billingContact_street3` varchar(256), 57 | `billingContact_street4` varchar(256), 58 | `billingContact_city` varchar(64), 59 | `billingContact_state` varchar(256), 60 | `billingContact_postalCode` varchar(45), 61 | `billingContact_country` varchar(45), 62 | `billingContact_fax` varchar(45), 63 | `billingContact_faxExt` varchar(45), 64 | `billingContact_telephone` varchar(45), 65 | `billingContact_telephoneExt` varchar(45), 66 | `technicalContact_rawText` longtext, 67 | `technicalContact_email` varchar(256), 68 | `technicalContact_name` varchar(256), 69 | `technicalContact_organization` varchar(256), 70 | `technicalContact_street1` varchar(256), 71 | `technicalContact_street2` varchar(256), 72 | `technicalContact_street3` varchar(256), 73 | `technicalContact_street4` varchar(256), 74 | `technicalContact_city` varchar(64), 75 | `technicalContact_state` varchar(256), 76 | `technicalContact_postalCode` varchar(45), 77 | `technicalContact_country` varchar(45), 78 | `technicalContact_fax` varchar(45), 79 | `technicalContact_faxExt` varchar(45), 80 | `technicalContact_telephone` varchar(45), 81 | `technicalContact_telephoneExt` varchar(45), 82 | `zoneContact_rawText` longtext, 83 | `zoneContact_email` varchar(256), 84 | `zoneContact_name` varchar(256), 85 | `zoneContact_organization` varchar(256), 86 | `zoneContact_street1` varchar(256), 87 | `zoneContact_street2` varchar(256), 88 | `zoneContact_street3` varchar(256), 89 | `zoneContact_street4` varchar(256), 90 | `zoneContact_city` varchar(64), 91 | `zoneContact_state` varchar(256), 92 | `zoneContact_postalCode` varchar(45), 93 | `zoneContact_country` varchar(45), 94 | `zoneContact_fax` varchar(45), 95 | `zoneContact_faxExt` varchar(45), 96 | `zoneContact_telephone` varchar(45), 97 | `zoneContact_telephoneExt` varchar(45), 98 | `registrarIANAID` varchar(45), 99 | primary key (`whois_record_flat_id`) 100 | )ENGINE=InnoDB ROW_FORMAT=COMPRESSED AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 101 | -------------------------------------------------------------------------------- /whoisxmlapi_csv2json/transform_json_verbose.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Python3 script for converting WhoisXML API csv files 4 | 
into JSON. 5 | This is the less portable version for python3 with verbose output. 6 | """ 7 | 8 | from argparse import ArgumentParser 9 | import csv 10 | import multiprocessing 11 | import json 12 | import sys 13 | import os 14 | from platform import system 15 | import tqdm 16 | 17 | VERSION = "0.0.2" 18 | MYNAME = sys.argv[0].replace('./', '') 19 | 20 | # Preparing arguments 21 | argparser = ArgumentParser(description='Convert WhoisXML API CSV files to JSON format.') 22 | argparser.add_argument('--version', 23 | help='Print version information and exit.', 24 | action='version', 25 | version=MYNAME + ' ver. ' + VERSION + '\n(c) WhoisXML API LLC.') 26 | argparser.add_argument('-i', '--path', 27 | help='input directory with uncompressed CSVs or single CSV file', 28 | type=str, required=True) 29 | argparser.add_argument('--key', help='primary key field for records, default "domainName"', 30 | type=str, default='domainName') 31 | argparser.add_argument('--threads', 32 | help='number of threads, default 1', 33 | type=int, default=1) 34 | argparser.add_argument('--force', help='overwrite existent files', action='store_true') 35 | argparser.add_argument('--human-readable', 36 | help='generate human readable output', 37 | action='store_true') 38 | argparser.add_argument('--no-progress', help='disable progress indicator', action='store_true') 39 | argparser.add_argument('--quiet', help='suppress all output', action='store_true') 40 | 41 | args = argparser.parse_args() 42 | 43 | # increase max size of the field 44 | if system() == 'Windows': 45 | csv.field_size_limit(2147483647) 46 | else: 47 | from sys import maxsize as csv_maxsize 48 | csv.field_size_limit(csv_maxsize) 49 | 50 | def print_verbose(text): 51 | """print messages if not in quiet mode""" 52 | if not args.quiet: 53 | print(text) 54 | 55 | def convert_json(csv_queue): 56 | """the actual job, done by each thread""" 57 | while not csv_queue.empty(): 58 | csv_file = csv_queue.get() 59 | json_file = os.path.join( 60 | os.path.dirname(csv_file), 61 | os.path.basename(csv_file).replace('.csv', '.json')) 62 | if args.force or not os.path.isfile(json_file): 63 | out_data = dict() 64 | with tqdm.tqdm(0, unit=' records', disable=args.no_progress) as pbar: 65 | pbar.set_description("Processing %s" % csv_file) 66 | with open(csv_file, 'rt') as infile: 67 | infile_csv = csv.DictReader(infile) 68 | for in_row in infile_csv: 69 | pbar.update(1) 70 | out_data.update({in_row[args.key]: {}}) 71 | for field in infile_csv.fieldnames: 72 | if field != args.key and in_row[field] != '': 73 | out_data[in_row[args.key]].update({field: in_row[field]}) 74 | with open(json_file, 'wt') as json_file_obj: 75 | print_verbose("Writing %s" % (json_file)) 76 | if args.human_readable: 77 | json_file_obj.write(json.dumps(out_data, indent=4)) 78 | else: 79 | json_file_obj.write(json.dumps(out_data)) 80 | del out_data 81 | 82 | 83 | if __name__ == '__main__': 84 | # populating queue 85 | csvQueue = multiprocessing.Queue() 86 | 87 | if os.path.isdir(args.path): 88 | for csv_f in os.listdir(args.path): 89 | if csv_f.endswith('.csv'): 90 | csvQueue.put(os.path.join(args.path, csv_f)) 91 | elif os.path.isfile(args.path) and args.path.endswith('.csv'): 92 | csvQueue.put(args.path) 93 | else: 94 | exit(1) 95 | 96 | threads = [] 97 | for t in range(0, args.threads): 98 | convert_thread = multiprocessing.Process(target=convert_json, args=(csvQueue, )) 99 | convert_thread.start() 100 | threads.append(convert_thread) 101 | for convert_thread in threads: 102 | convert_thread.join() 103 
| 
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/README.SSL.txt:
--------------------------------------------------------------------------------
1 | Important note 2023-06-07
2 | 
3 | SSL authentication is obsolete and no longer supported.
4 | With --disable-ssl-verification your credentials may still work, but use this at your own risk.
5 | 
6 | It is recommended to use regular password authentication instead.
7 | 
8 | ---------------------------legacy document follows
9 | install_p12.py
10 | 
11 | Setting up ssl authentication to use with download_whois_data.py
12 | 
13 | Provided by WhoisXML API, Inc.
14 | 
15 | dated: 2018-12-10
16 | updated: 2020-10-02
17 | 
18 | Contents:
19 | ---------
20 | 
21 | 1. Brief summary
22 | 
23 | 2. Prerequisites
24 | 
25 | 3. Installing the auth credentials
26 | 
27 | 4. Using the downloader script
28 | 
29 | 1. Brief summary
30 | ----------------
31 | 
32 | The goal is to set up your authentication credentials so that you can
33 | use download_whois_data.py with the option --sslauth instead of
34 | password authentication.
35 | 
36 | 2. Prerequisites
37 | ----------------
38 | 
39 | You will need a p12 pack, a file provided to you by WhoisXML API,
40 | Inc. You need a password you have received in conjunction with this
41 | file. (This is the file imported to your system or your browser in
42 | order to facilitate ssl auth.)
43 | 
44 | To be used with the downloader script, this file has to be converted
45 | into three other files suitable for the script.
46 | 
47 | We remark that the files are the same as those used by the bash
48 | downloader (in the whoisxmlapi_whoisdownload_bash subdirectory of the git repo
49 | https://github.com/whois-api-llc/whois_database_download_support). So
50 | you may choose to run the bash script for the conversion, which is
51 | described in the file "README.SSL" in that project. You may choose to
52 | copy the files whoisxmlapi.ca, client.crt and client.key generated
53 | with the bash converter next to download_whois_data.py. If you have
54 | done so, you do not need to read further; you can use SSL auth
55 | readily.
56 | 
57 | So let us assume that given your p12 pack and password, you want to
58 | generate the three required files with the Python script
59 | 
60 | install_p12.py.
61 | 
62 | In addition to the prerequisites of download_whois_data.py (described
63 | in the main README of the project), you will need two additional Python
64 | packages. So as root, or in an administrator command-prompt on
65 | native Windows systems (prepared to run download_whois_data.py), do
66 | 
67 | pip3 install pyOpenSSL pycryptodome
68 | 
69 | 
70 | if you are using series 3 Python.
71 | 
72 | On older Python3 versions the legacy pycrypto package may also work:
73 | 
74 | pip3 install pyOpenSSL pycrypto
75 | 
76 | whereas if you use series 2 Python, do
77 | 
78 | pip2 install pycrypto pyopenssl
79 | 
80 | to install the requirements. Alternatively you may search for the
81 | required packages in your operating system's package manager, or
82 | create a Python virtual environment.
83 | 
84 | 
85 | 3. Installing the auth credentials
86 | ----------------------------------
87 | 
88 | You need to do the following just once:
89 | 
90 | -Put your p12 pack file into the directory where the download scripts
91 | (and this file) reside.
92 | 
93 | -Run the following command in the directory:
94 | 
95 | install_p12.py
96 | 
97 | (either from a shell command-line or by double-clicking it on
98 | Windows).
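If nothing happens this way (e.g. the file is not marked as executable, or no file association is set up for Python scripts), you can equally start it explicitly through the interpreter; for instance, assuming Python 3 is on your PATH:

python3 install_p12.py
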
A series of dialog windows will guide you through the
99 | simple process:
100 | 
101 | -You have to select the pack.p12 file
102 | 
103 | -Then you will be prompted for the password.
104 | 
105 | If everything went right, a dialog window will appear informing you
106 | that the files are prepared and you can use the script. If an error
107 | occurs, you will be informed about it in a window; in this case, check
108 | the files and the password and run the script again.
109 | 
110 | The files generated by the script are client.crt, client.key and
111 | whoisxmlapi.ca, needed for the authentication; they are placed
112 | next to the script by default. The key file will be readable only by
113 | you.
114 | 
115 | IMPORTANT: keep the generated client.key confidential.
116 | 
117 | 4. Using the script
118 | -------------------
119 | 
120 | To use ssl authentication instead of password authentication, add
121 | 
122 | --sslauth
123 | 
124 | to the options of the script when you use it. You will not need the
125 | --user and --password options then, and the script will ignore the
126 | password configuration ini file.
127 | 
--------------------------------------------------------------------------------
/whoisxmlapi_csv2json/README:
--------------------------------------------------------------------------------
1 | Documentation for the WhoisXML API
2 | 
3 | WHOIS CSV to JSON converter scripts
4 | 
5 | Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
6 | -------------------------------------------------------------------
7 | 
8 | The scripts are provided for our subscribers.
9 | 
10 | The aim of the scripts is to convert WHOIS data downloaded in CSV
11 | format to JSON.
12 | 
13 | It is a cross-platform solution for end-users. It can be used on
14 | Windows and Linux/Unix-type systems and does not require special
15 | experience.
16 | 
17 | The user should be familiar, however, with the feeds and data formats,
18 | which are described in the reference manuals of the respective feeds.
19 | 
20 | Script availability:
21 | --------------------
22 | 
23 | The primary location of this script is the public GitHub repository
24 | 
25 | https://github.com/whois-api-llc/whois_database_download_support
26 | 
27 | The scripts are located in the subdirectory
28 | 
29 | whoisxmlapi_csv2json
30 | 
31 | Contents
32 | --------
33 | 
34 | 1. Quickstart
35 | 2. Options of transform_json.py
36 | 3. A less portable verbose version: transform_json_verbose.py
37 | 4. Output file format
38 | 
39 | 1. Quickstart
40 | -------------
41 | 
42 | The fastest way of using the script is the following:
43 | 
44 | - Make sure you have either Python 2 (tested with 2.7.10) or Python 3
45 |   installed and working on your system.
46 | 
47 | - Install the following Python packages:
48 | 
49 |   argparse, csv, multiprocessing, json
50 | 
51 |   (These are part of the standard library in recent Python versions.)
52 |   If one is missing, you can install it with pip, Python's package manager
53 |   ("pip install <package>" in a command-line), or with the package manager of your system.
54 | 
55 | - Download "simple", "regular" or "full" CSV files from WhoisXML API
56 |   data feeds. You can use this script with files from quarterly
57 |   databases as well as daily feeds.
58 | 
59 |   Consult the documentation of WhoisXML API data download products
60 |   for more information.
The manuals are available from
61 | 
62 | http://www.domainwhoisdatabase.com/docs/
63 | 
64 | - Having your CSV files in a given directory, say "foo/", you can
65 |   convert them to JSON by using the script in the command line the
66 |   following way:
67 | 
68 |   transform_json.py -i foo/
69 | 
70 |   The script will not produce any console output.
71 |   Depending on the number of files, the process may take a
72 |   long time.
73 | 
74 | The JSON files will be next to their CSV counterparts with the same
75 | basename.
76 | 
77 | 2. Options of transform_json.py
78 | -------------------------------
79 | 
80 | The script is self-documenting; you can obtain the description with
81 | 
82 | transform_json.py --help
83 | 
84 | The output lists the options:
85 | ---
86 | usage: transform_json.py [-h] [--version] -i PATH [--key KEY]
87 |                          [--threads THREADS] [--force] [--human-readable]
88 | 
89 | Convert CSV to JSON format
90 | 
91 | optional arguments:
92 |   -h, --help            show this help message and exit
93 |   --version             Print version information and exit.
94 |   -i PATH, --path PATH  input directory with uncompressed CSVs or single CSV
95 |                         file
96 |   --key KEY             primary key field for records, default "domainName"
97 |   --threads THREADS     number of threads, default 1
98 |   --force               overwrite existent files
99 |   --human-readable      generate human readable output
100 | 
101 | --
102 | Comments:
103 | 
104 | - The script supports multi-threaded operation with the --threads option.
105 | - The --human-readable option results in JSON files easily readable as text.
106 | - See also the description of the output file format in Section 4.
107 | 
108 | 3. A less portable verbose version: transform_json_verbose.py
109 | -------------------------------------------------------------
110 | 
111 | If you want to follow the progress of the conversion, there is another
112 | script available under the name "transform_json_verbose.py".
113 | 
114 | It requires Python 3 and an additional python package, "tqdm", to work.
115 | 
116 | In addition to the options of transform_json.py, this script produces
117 | a verbose output by default:
118 | 
119 | -A progress bar showing the status of the reading of files.
120 | 
121 | -A message about writing JSON files.
122 | 
123 | These can be suppressed by the
124 | 
125 | --no-progress
126 | 
127 | and
128 | 
129 | --quiet
130 | 
131 | options respectively.
132 | 
133 | Apart from this, the operation of the script is the same as that of
134 | transform_json.py.
135 | 
136 | 4. Output file format
137 | ---------------------
138 | 
139 | The resulting file contains a single JSON string. Within this, at the first
140 | level, each record appears as a value in a key-value pair, where the
141 | key is the field specified by the --key option of the scripts, the
142 | domain name by default. In the value, the non-empty fields of the
143 | record appear as key-value pairs.
144 | 
145 | If the --human-readable option was set, the file contains proper
146 | indentations and newlines to be easily readable as plain text. Without
147 | the option, a JSON file for machine processing is obtained.
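As an illustration (a hypothetical record, not actual feed data), a CSV file with the header line

domainName,registrarName,contactEmail

and the single data line

example.com,Example Registrar LLC,admin@example.com

is converted, with the default --key and the --human-readable option, into

{
    "example.com": {
        "registrarName": "Example Registrar LLC",
        "contactEmail": "admin@example.com"
    }
}

Empty fields would simply be omitted from the record's key-value pairs.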
148 | 
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/FAQ.txt:
--------------------------------------------------------------------------------
1 | FAQ.txt for
2 | 
3 | download_whois_data.py
4 | 
5 | Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
6 | -------------------------------------------------------------------
7 | 
8 | Q1: I obtained an error stating that my login and password are
9 | invalid. I can access the page with my browser. What happened?
10 | 
11 | A1: It might be the case that you have specified a wrong date or
12 | database version. E.g. you chose v20 of quarterly cctld feeds when the
13 | last version is v8. As the script does not have data on the available
14 | db versions, it derives file locations from the data you have
15 | specified, but these locations do not exist on our server. For
16 | security reasons the server will report invalid login credentials, so
17 | it will appear as if there was something wrong with your password.
18 | 
19 | Please double-check the feed name and the parameters you are using.
20 | --------------------------------------------------------------------
21 | 
22 | Q2: The script says it cannot determine the list of supported
23 | tlds. Why?
24 | 
25 | A2: In spite of all of our efforts it may happen that the necessary
26 | supported_tlds file is missing. Please contact support in this case.
27 | 
28 | --------------------------------------------------------------------
29 | 
30 | Q3: The script reports files which could not be
31 | downloaded. When will they be available?
32 | 
33 | A3: It may happen that some files derived according to the naming
34 | logic of a given feed do not exist at the time of downloading. One
35 | reason might be that the file is not yet prepared when you run the
36 | script. If you re-run the script later with the same parameters, it
37 | will not redownload files which are already there and have not been
38 | changed, but it shall find the missing ones. It may also occur that
39 | the file will never exist. In feeds devoted to changes
40 | (domain_names_new, domain_names_dropped), for instance, it might
41 | happen that there were no changes in the data on the given day. We do not
42 | store empty files, so these files will be reported as unavailable, but
43 | this is normal.
44 | 
45 | --------------------------------------------------------------------
46 | 
47 | Q4: I do not want the script to check all supported TLDs with the
48 | --all-tlds option in case of daily feeds; I want it to try downloading
49 | only for those TLDs in which there was a change on the given day and
50 | thus the data file exists.
51 | 
52 | A4: Using the --only-changed option will result in the desired
53 | behavior. It does not work for all daily feeds, e.g. "delta" feeds do
54 | not support it, but "new" and "dropped" feeds do. If downloading for
55 | multiple days, an attempt will be made to download data for all the
56 | TLDs which had a change on at least one of the days.
57 | 
58 | --------------------------------------------------------------------
59 | 
60 | Q5: So far I have been using "download_whois_info.py" which I had
61 | downloaded from a release a few years ago. I've just realized that it
62 | is not supported anymore and it has been replaced by
63 | "download_whois_data". I decided to switch to the new script, but the
64 | options of the script are different and not compatible. Why? Can you
65 | make it compatible?
66 | 
67 | A5: We decided to redesign the python downloader in 2017 because
68 | "download_whois_info.py" was not scalable; initially it was intended
69 | as a small example script, but the requirements for a downloader
70 | script went far beyond the original idea, so a redesign was
71 | unavoidable. The current script has many options and many more
72 | capabilities than the legacy one. So supporting the legacy and rather
73 | illogical options would lead to an extremely large number of
74 | options. In addition, the operation logic behind the new script is also
75 | different; it is not always possible to map the new options to the old
76 | ones. So it will not be made compatible, but the command-line can be
77 | easily rewritten along the following lines:
78 | 
79 | - old script option: -c
80 |   use --feed instead
81 | 
82 | - old script option: -l or --login:
83 |   use --username instead
84 | 
85 | - old script option: -p or --password:
86 |   use --password
87 | 
88 | - old script option: -d or --date, with YYYY-MM-DD:
89 |   use --startdate with YYYYMMDD
90 | 
91 | - old script option: --end-date, with YYYY-MM-DD:
92 |   use --enddate with YYYYMMDD
93 | 
94 | - old script option: -v or --version:
95 |   use --dbversion instead
96 | 
97 | - old script option: -t, --tld, possibly with a space-separated list
98 |   use --tlds; for more tlds, use a comma-separated list.
99 |   It has no default value. For all tlds, use the --all-tlds option
100 | 
101 | - old script option: -f
102 |   Use --dataformats instead, a comma-separated list for all formats.
103 |   There is no "all" possibility. Use the --list-dataformats option to list
104 |   the available formats for each feed.
105 | 
106 | - old script option: -odir or --output-dir
107 |   Use --output-dir. It has no default value; hence, it is a mandatory
108 |   argument now.
109 | 
110 | - old script option: --interactive
111 |   Now it invokes the GUI mode of the script.
112 |   The old function of --interactive is not supported.
113 |   Consult the documentation on how to influence
114 |   redownloading behavior, especially the --maxtries option
115 | 
116 | - old script option: --no-override
117 |   No such option. Currently if a file is there,
118 |   it will not be downloaded again unless its md5sum
119 |   does not match the file. To redownload,
120 |   delete the respective file.
121 | 
122 | 
123 | 
--------------------------------------------------------------------------------
/whoisxmlapi_percona_loader_scripts/README.txt:
--------------------------------------------------------------------------------
1 | Documentation for WhoisXML API
2 | MySQL binary dump loader scripts
3 | 
4 | Document version 1.0 dated 2017-07-24
5 | 
6 | Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
7 | 
8 | The scripts are provided for our subscribers to load binary mysql dumps
9 | obtained from our quarterly feeds into MySQL databases. The scripts
10 | can also be used as an example to create custom loader scripts.
11 | 
12 | Script availability:
13 | --------------------
14 | 
15 | The primary location of this script is the public GitHub repository
16 | 
17 | https://github.com/whois-api-llc/whois_database_download_support
18 | 
19 | The script is located in the subdirectory
20 | 
21 | whoisxmlapi_percona_loader_scripts
22 | 
23 | Contents:
24 | ---------
25 | 
26 | 1. List of files
27 | 
28 | 2. Obtaining data
29 | 
30 | 3. Software environment
31 | 
32 | 4. Using the script
33 | 
34 | 1.
List of files:
35 | ----------------
36 | 
37 | README : this file
38 | load_mysql_utils.sh : utility functions used in all scripts.
39 |                       This should be in the same
40 |                       directory as the script itself.
41 | load_whois_percona.sh : The script to run.
42 | whoiscrawler_mysql_schema.sql : The schema file needed by the script.
43 |                       By default it should be in the same directory
44 |                       as the script.
45 | legacy : a directory containing legacy versions of
46 |          the script which were in use before July 2017.
47 | 
48 | 2. Obtaining data
49 | -----------------
50 | 
51 | Data files which can be loaded by these scripts can be obtained from
52 | 
53 | http://domainwhoisdatabase.com/whois_database/v20/database_dump/percona
54 | 
55 | (replace v20 by the actual version)
56 | 
57 | and for cctlds from
58 | 
59 | http://www.domainwhoisdatabase.com/domain_list_quarterly/v6/database_dump/percona/
60 | 
61 | (replace v6 by the actual version)
62 | 
63 | 3. Software environment
64 | -----------------------
65 | 
66 | The present version was tested with
67 | 
68 | mysql Ver 14.14 Distrib 5.7.18
69 | 
70 | and
71 | 
72 | GNU bash, 4.3.48(1)-release
73 | 
74 | on a machine running Ubuntu Linux 16.04.2 LTS.
75 | 
76 | The scripts are standard ones which should also work with earlier versions
77 | of bash and on other systems (Linux, Mac OS X, and Windows). They
78 | should be compatible with other versions of MySQL, too.
79 | 
80 | If you run into an incompatibility, please contact our support.
81 | 
82 | 4. Using the script
83 | -------------------
84 | 
85 | Step 1. : obtain data
86 | .....................
87 | We assume that you are working in the directory where this script and
88 | the files listed in Section 1 reside.
89 | 
90 | Create a subdirectory for the data to be downloaded, say "whois_data".
91 | 
92 | Download the data from
93 | 
94 | http://domainwhoisdatabase.com/whois_database/v20/database_dump/percona/
95 | 
96 | (please replace v20 with the database version you are using)
97 | 
98 | into this directory. You need the files $tld.7z for the tld-s you are
99 | interested in. You can use the provided md5 and sha sums to verify
100 | your downloads.
101 | 
102 | Assume now that you are interested in the domains "aaa" and "aarp",
103 | so you have "aaa.7z" and "aarp.7z" in the directory "whois_data".
104 | 
105 | Step 2. Verify your files
106 | .........................
107 | This step can be omitted, but it is recommended to do it.
108 | Run the following command-line in the script's directory:
109 | 
110 | ./load_whois_percona.sh --import-data-dir=whois_data --tlds=aaa,aarp --db-version=v20 --verbose --dry-run
111 | 
112 | (--tlds should be replaced by the comma-separated list of tld-s you
113 | are interested in, and you have to provide the version, v20 in our case.
114 | --dry-run ensures that the script will not yet do anything with MySQL.)
115 | 
116 | If the script does not report any error, you have all the required
117 | data files. Notice also that the script has extracted the 7zipped data.
118 | 
119 | Step 3. Verify your database
120 | ............................
121 | 
122 | Please verify that the databases named "whoiscrawler_$dbver_$tld" do
123 | not yet exist in your MySQL server. If they exist, please drop them.
124 | 
125 | Verify that you have a user in MySQL who can create tables, etc. The
126 | easiest way is to use the root user. If you set up ~/.my.cnf so that
127 | the root user logs in without a password when issuing the "mysql"
128 | command, you will not need to specify the mysql user and password.
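For reference, a minimal ~/.my.cnf for this purpose might look like the following sketch (the password value is a placeholder; keep the file private, e.g. with "chmod 600 ~/.my.cnf"):

[client]
user=root
password=YOUR_MYSQL_ROOT_PASSWORD
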
129 | 
130 | In case of large domains, it is also recommended to apply the
131 | fine-tuning settings on your database as described in the Reference Manual
132 | of the database release.
133 | 
134 | Make sure that your mysql server stores its data in /var/lib/mysql .
135 | If it stores them in some other directory, you will have to add the
136 | --mysql-data-dir=DIRECTORY option to the command-line in the next
137 | step, where DIRECTORY is the respective directory of your server.
138 | 
139 | During the load process the script has to restart your mysql server
140 | several times. You are supposed to run it with a superuser so this
141 | should be possible. The mysql stop and start commands are configured
142 | in lines 25 and 26 of the script. By default we provide a standard
143 | Linux setting which is the default of most Linux systems (and other
144 | System V type UNIX-like systems). If you use another platform, please
145 | customize these lines (e.g. "net stop MySQL57" and "net start MySQL57"
146 | on Windows).
147 | 
148 | Step 4. Load the data
149 | .....................
150 | 
151 | To load your data, do
152 | 
153 | sudo ./load_whois_percona.sh --import-data-dir=whois_data --tlds=aaa,aarp --db-version=v20 --verbose
154 | 
155 | You need to have write permission to MySQL's data directory to succeed. The
156 | easiest way is to run your script as root, e.g. with sudo, as above.
157 | You may set up some less risky way to do it without sudo, however.
158 | 
159 | If your ~/.my.cnf is not set up to enable the root user (or some other
160 | user with database creation permissions), please use the --mysql-user
161 | and --mysql-password options, too, in order to specify the required
162 | username and password.
163 | 
164 | You will now have the data loaded into the respective databases.
165 | 
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/README:
--------------------------------------------------------------------------------
1 | Documentation for WhoisXML API
2 | MySQL ASCII dump loader scripts
3 | (BASH version)
4 | 
5 | (Note: there is a separate file named README_Python.txt for Python
6 | scripts. If you do not plan to work with huge domains such as .com or
7 | you are using a Windows system, we recommend first checking the Python
8 | scripts, which contain a simple GUI and are easier to use than the
9 | bash shell scripts.)
10 | 
11 | Document version 1.0 dated 2017-07-14
12 | 
13 | Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
14 | 
15 | The scripts are provided for our subscribers to load ASCII mysql dumps
16 | obtained from our quarterly feeds into MySQL RDBMS to set up a WHOIS
17 | database. The scripts can also be used as an example to create custom
18 | loader scripts.
19 | 
20 | 
21 | Script availability:
22 | --------------------
23 | 
24 | The primary location of this script is the public GitHub repository
25 | 
26 | https://github.com/whois-api-llc/whois_database_download_support
27 | 
28 | The script is located in the subdirectory
29 | 
30 | whoisxmlapi_mysqldump_loaders
31 | 
32 | 
33 | Contents:
34 | ---------
35 | 
36 | 1. List of files
37 | 
38 | 2. Obtaining data
39 | 
40 | 3. Software environment
41 | 
42 | 4. Loading schema and table data from separate files
43 | 
44 | 5. Loading schema and table data from a single file
45 | 
46 | 6. Loading multiple tlds and other usage examples
47 | 
48 | 1.
List of files:
51 | ----------------
52 | 
53 | README : this file
54 | load_mysql_utils.sh : utility functions used in all scripts.
55 |                       This should be in the same
56 |                       directory as the scripts themselves
57 | load_mysql_data_per_tables.sh : script to load the schema first, then the data
58 |                       from separate files
59 | load_mysql_data_all.sh : script to load single-file backups
60 | 
61 | legacy : a directory containing legacy versions of
62 |          these scripts which were in use before July 2017.
63 | README_Python.txt : Documentation of the Python scripts for the same task
64 | load_mysql_data.py : Python (series 2) script for the same task.
65 |          Documented separately in the file README_Python.txt
66 | 
67 | 2. Obtaining data
68 | -----------------
69 | 
70 | Sample data can be obtained from
71 | 
72 | http://domainwhoisdatabase.com/whois_database/sample/gtlds/v20/mysqldump_sample
73 | 
74 | (replace v20 by the actual version)
75 | 
76 | Production data can be obtained from
77 | 
78 | http://domainwhoisdatabase.com/whois_database/v20/database_dump/mysqldump/
79 | 
80 | (replace v20 by the actual version)
81 | 
82 | Single-file backups of production data can also be downloaded with our downloader scripts. Example:
83 | 
84 | ./whoisdownload.sh --verbose --user my_username --password my_password --db-version v20 --data-feeds whois_database --tld "aaa" --file-format sql --output-dir=testdir
85 | 
86 | We refer to the documentation of the downloader scripts for further details.
87 | 
88 | 3. Software environment
89 | -----------------------
90 | 
91 | The present version was tested with
92 | 
93 | mysql Ver 14.14 Distrib 5.7.18
94 | 
95 | and
96 | 
97 | GNU bash, 4.3.48(1)-release
98 | 
99 | on a machine running Ubuntu Linux 16.04.2 LTS.
100 | 
101 | The scripts are standard ones which should also work with earlier versions
102 | of bash and on other systems (Linux, Mac OS X, and Windows). They
103 | should be compatible with other versions of MySQL, too.
104 | 
105 | If you run into an incompatibility, please contact our support.
106 | 
107 | 4. Loading schema and table data from separate files
108 | ----------------------------------------------------
109 | 
110 | Note: in the tasks described here and in Section 5, the syntax of the
111 | scripts used is the same if you use the --tld option.
112 | 
113 | In the following example we plan to create our MySQL table for the tld
114 | "aaa", loading the schema first, then the data.
115 | 
116 | This approach is recommended for large tlds such as "com". In such
117 | cases we also recommend using the --show-progress option, which draws
118 | a progress bar showing the loading status of each file and an
119 | estimated completion time. Note that e.g. loading the
120 | data of the "com" domain will take several days, so it is important to
121 | follow what is going on.
122 | 
123 | As an input we need the files
124 | 
125 | whoiscrawler_$version_$tld_mysql_schema.sql.gz
126 | 
127 | (e.g. whoiscrawler_v20_aaa_mysql_schema.sql.gz) and the "tables"
128 | subdirectory in the same directory as this file.
129 | 
130 | The script to be used is load_mysql_data_per_tables.sh.
131 | 
132 | Run the script with the --help option to see the parameters and
133 | examples for loading your data (the examples in the scripts' help messages
134 | are for loading the data of the "aaa" domain).
135 | 
136 | Note: the script has three options which can be used to do the job
137 | partially, or the whole job in multiple steps:
138 | 
139 | --no-create-db skips the step of creating a new database.
140 |                In this case the script assumes that the MySQL database
141 |                to be used already exists.
142 | --data-only    skips the loading of the schema,
143 |                only loads data into the database.
144 |                The database is assumed to exist in this case, too.
145 | --schema-only  Loads the schema only (and creates the database, unless
146 |                --no-create-db is also given). The data are not loaded.
147 | 
148 | 
149 | 5. Loading single-file backups
150 | ------------------------------
151 | 
152 | Note: in the tasks described here and in Section 4, the syntax of the
153 | scripts used is the same if you use the --tld option.
154 | 
155 | In the following example we plan to create our MySQL table for the tld
156 | "aaa", loading its single-file backup.
157 | 
158 | As an input we need the file
159 | 
160 | whoiscrawler_$version_$tld_mysql.sql.gz
161 | 
162 | E.g. whoiscrawler_v20_aaa_mysql.sql.gz .
163 | 
164 | The script to be used is load_mysql_data_all.sh.
165 | 
166 | Run the script with the --help option to see the parameters and
167 | examples for loading your data (the examples in the scripts' help messages
168 | are for loading the data of the "aaa" domain).
169 | 
170 | 6. Loading data for multiple tlds
171 | ---------------------------------
172 | 
173 | Assume you have downloaded data for the following tlds from the v20
174 | quarterly release:
175 | 
176 | asia,us,biz,mobi,info,org,net,com
177 | 
178 | and you have placed the data into the subdirectories
179 | 
180 | database_dump/mysqldump/$tld
181 | 
182 | of the directory where the scripts reside.
183 | 
184 | Assume you want to load them into databases named "production_db_$tld"
185 | with your mysql user "whoisuser" who has the password "whoispassword".
186 | 
187 | To load them all so that the schema is loaded first, then the data from
188 | tables, the following command-line will do the job in bash:
189 | 
190 | for tld in asia us biz mobi info org net com; do ./load_mysql_data_per_tables.sh --mysql-database=production_db_$tld --mysql-user=whoisuser --mysql-password=whoispassword --schema-files-dir=database_dump/mysqldump --tld=$tld --db-version=v20;done
191 | 
192 | Alternatively, you may load the data from single files:
193 | 
194 | for tld in asia us biz mobi info org net com; do ./load_mysql_data_all.sh --mysql-database=production_db_$tld --mysql-user=whoisuser --mysql-password=whoispassword --dump-files-dir=database_dump/mysqldump --tld=$tld --db-version=v20;done
195 | 
196 | (Note that we have changed the name of the script only.)
197 | 
198 | The above examples can be used as a template to manage various
199 | situations in bash.
200 | 
--------------------------------------------------------------------------------
/whoisxmlapi_bash_csv_to_mysqldb/load_csv_file_into_db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #Sample script to load the downloaded csv into a database
4 | #Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
5 | #
6 | #Note: IF YOU ARE READING THIS SCRIPT JUST TO COLLECT IDEAS FOR YOUR OWN LOADER,
7 | #      VISIT THE END OF THE FILE WHERE THE REAL WORK IS DONE
8 | #
9 | # Global variables.
10 | #
11 | LANG=C
12 | LC_ALL=C
13 | VERSION="0.0.3"
14 | VERBOSE="no"
15 | DEBUG="no"
16 | MYNAME=$(basename $0)
17 | 
18 | 
19 | 
20 | #No mysql stuff by default. This is set by mandatory args.
21 | unset MYSQL_USER
22 | unset MYSQL_PASSWORD
23 | unset MYSQL_DATABASE
24 | 
25 | #
26 | # Prints the version number and exits.
27 | # 28 | function printVersionAndExit() 29 | { 30 | echo "$MYNAME Version $VERSION" 31 | echo "" 32 | exit 0 33 | } 34 | 35 | function printHelpAndExit() 36 | { 37 | echo "Usage: $MYNAME [OPTION]..." 38 | echo "$MYNAME -- loads data from a csv file downloaded from WhoisXML API feeds" 39 | echo " into a table in a mysql database." 40 | echo "" 41 | echo " -h, --help Print this help and exit." 42 | echo " -v, --version Print version information and exit." 43 | echo " --verbose Print more messages." 44 | echo " --mysql-user=USERNAME User name to login to the mysql database (optional)." 45 | echo " --mysql-password=PASSWORD Password to login to the data source (optional)." 46 | echo " --mysql-database=DATABASE The name of the mysql database to load data into." 47 | echo " --csv-format=FORMAT The format of the csv file to be loaded. Must be one of 'regular', 'simple' or 'full'." 48 | echo " --schema-file=SCHEMAFILE The schema file to be used when loading. These are provided with the script." 49 | echo " --csv-file=CSVFILE The csv file to be loaded." 50 | echo "" 51 | echo "Example:" 52 | echo "$MYNAME --mysql-user=whoisuser --mysql-password=whoispassword --mysql-database=whoisdatabase --schema-file=loader_schema_simple.sql --csv-file=1.csv --csv-format=simple" 53 | echo "" 54 | echo 55 | echo "" 56 | echo "The table into which data are loaded is " 57 | echo " whois_record_flat for 'full' csv-s, " 58 | echo " whois_record_flat_simple for 'simple' csv-s, and" 59 | echo " whois_record_flat_regular for 'regular' csv-s." 60 | echo "" 61 | echo "Note: record id-s are auto incremented," 62 | echo " so each record is loaded again when the script is run." 63 | echo " This can lead to repetitions." 64 | echo "" 65 | exit 1 66 | } 67 | 68 | # 69 | # 70 | # Prints all the arguments but only if the program is in the verbose mode. 71 | # 72 | function printVerbose() 73 | { 74 | if [ "$VERBOSE" == "true" ]; then 75 | echo $* >&2 76 | fi 77 | } 78 | 79 | # 80 | # Prints an error message to the standard error. The text will not mixed up with 81 | # the data that is printed to the standard output. 82 | # 83 | function printError() 84 | { 85 | echo "$*" >&2 86 | } 87 | 88 | function printMessage() 89 | { 90 | echo -n "$*" >&2 91 | } 92 | 93 | function printMessageNl() 94 | { 95 | echo "$*" >&2 96 | } 97 | 98 | function printDebug() 99 | { 100 | if [ "$DEBUG" == "yes" ]; then 101 | echo "$*" >&2 102 | fi 103 | } 104 | 105 | 106 | ARGS=$(\ 107 | getopt -o hv \ 108 | -l "help,verbose,debug,version,mysql-database:,mysql-user:,mysql-password:,csv-format:,\ 109 | csv-file:,schema-file:" \ 110 | -- "$@") 111 | 112 | 113 | if [ $? -ne 0 ]; then 114 | exit 6 115 | fi 116 | 117 | eval set -- "$ARGS" 118 | 119 | while true; do 120 | case "$1" in 121 | -h|--help) 122 | shift 123 | printHelpAndExit 124 | ;; 125 | 126 | --verbose) 127 | shift 128 | VERBOSE="true" 129 | VERBOSEARG="--verbose" 130 | ;; 131 | 132 | --debug) 133 | shift 134 | DEBUG="yes" 135 | VERBOSEARG="--verbose" 136 | ;; 137 | 138 | -v|--version) 139 | shift 140 | printVersionAndExit 141 | ;; 142 | 143 | --mysql-user) 144 | shift 145 | MYSQL_USER=$1 146 | shift 147 | ;; 148 | 149 | --mysql-password) 150 | shift 151 | export MYSQL_PWD=$1 152 | shift 153 | ;; 154 | 155 | --mysql-database) 156 | shift 157 | MYSQL_DATABASE=$1 158 | shift 159 | ;; 160 | 161 | 162 | --csv-format) 163 | shift 164 | if echo $1 | grep --quiet -e "simple\|regular\|full"; then 165 | FORMAT=$1 166 | else 167 | printError "Supported csv formats are: simple, regular, and full." 
168 |             exit 1
169 |         fi
170 | 
171 |         shift
172 |         ;;
173 | 
174 |     --csv-file)
175 |         shift
176 |         CSV_FILE=$(readlink -e "$1")
177 |         if ! [ -f "$CSV_FILE" ]; then
178 |             printError "The csv file $CSV_FILE is not found."
179 |             exit 111
180 |         fi
181 |         shift
182 |         ;;
183 | 
184 |     --schema-file)
185 |         shift
186 |         SCHEMA_FILE=$(readlink -e "$1")
187 |         if ! [ -f "$SCHEMA_FILE" ]; then
188 |             printError "The schema file $SCHEMA_FILE is not found."
189 |             exit 1
190 |         fi
191 |         shift
192 |         ;;
193 |     --)
194 |         shift
195 |         break
196 |         ;;
197 | 
198 |     *)
199 |         ;;
200 | esac
201 | done
202 | 
203 | #some verification before doing the real job
204 | 
205 | #Set up mysql login credentials if needed
206 | if [ -n "$MYSQL_USER" ]; then
207 |     MYSQL_ARGUMENTS="--user=$MYSQL_USER"
208 | #    if [ -n "$MYSQL_PASSWORD" ];then
209 | #        MYSQL_ARGUMENTS="$MYSQL_ARGUMENTS --password=$MYSQL_PASSWORD"
210 | #    fi;
211 | fi;
212 | 
213 | printDebug "Mysql arguments: $MYSQL_ARGUMENTS"
214 | printDebug "Mysql password: $MYSQL_PWD"
215 | 
216 | if [ -z "$MYSQL_DATABASE" ]; then
217 |     printError "Mysql database not specified. See $MYNAME --help"
218 |     exit 1
219 | fi
220 | if [ ! -f "$CSV_FILE" ]; then
221 |     printError "Input csv file not specified or does not exist. See $MYNAME --help"
222 |     exit 1
223 | fi
224 | if [ ! -f "$SCHEMA_FILE" ]; then
225 |     printError "Schema file not specified or does not exist. See $MYNAME --help"
226 |     exit 1
227 | fi
228 | CSV_FILE=`readlink -e $CSV_FILE`
229 | SCHEMA_FILE=`readlink -e $SCHEMA_FILE`
230 | case ${FORMAT} in
231 |     simple|regular )
232 |         table="whois_record_flat_${FORMAT}"
233 |         ;;
234 |     full )
235 |         table="whois_record_flat"
236 |         ;;
237 |     * )
238 |         echo "FORMAT must be specified (simple, regular, or full)"
239 |         exit 1
240 |         ;;
241 | esac
242 | 
243 | #HERE WE DO THE REAL WORK.
244 | #IF YOU USE THIS SCRIPT JUST TO COLLECT IDEAS, START READING HERE
245 | 
246 | if [[ -z $(mysql $(eval echo "$MYSQL_ARGUMENTS") -A --skip-column-names ${MYSQL_DATABASE} <<< "SHOW TABLES LIKE \"${table}\";") ]]
247 | then
248 |     printVerbose "Loading schema for table $table."
249 |     mysql $(eval echo "$MYSQL_ARGUMENTS") ${MYSQL_DATABASE} ${VERBOSEARG} <${SCHEMA_FILE}
250 | else
251 |     printVerbose "Not loading schema, $table exists."
252 | fi
253 | 
254 | #Determining the line terminator of the csv file
255 | line_terminator="\\n"
256 | if file ${CSV_FILE} | grep -q CRLF ; then
257 |     line_terminator="\\r\\n"
258 |     printVerbose "Windows-style CRLF terminated input file detected."
259 | else
260 |     printVerbose "UNIX-style LF terminated input file detected."
261 | fi
262 | 
263 | fields=$(head -n 1 ${CSV_FILE}|sed 's/"//g')
264 | 
265 | mysql $(eval echo "$MYSQL_ARGUMENTS") ${MYSQL_DATABASE} ${VERBOSEARG} -e "load data local infile \"${CSV_FILE}\" IGNORE into table $table
266 | fields terminated by ',' enclosed by '\"' LINES TERMINATED BY '${line_terminator}' IGNORE 1 LINES (${fields})"
267 | 
--------------------------------------------------------------------------------
/whoisxmlapi_flexible_csv_to_mysqldb/README:
--------------------------------------------------------------------------------
1 | flexible_csv_to_mysqldb.py -- a Python3 script to create and maintain a
2 |                               WHOIS database in MySQL, using csv files
3 |                               obtained from WhoisXML API.
4 | 
5 | ver. 0.0.2
6 | 
7 | Changelog:
8 | 
9 | ver. 0.0.1, dated 2018.01.09.
10 |     - initial release
11 | 
12 | ver. 0.0.2, dated 2019.04.17.
13 | - introduced field_types.csv, fixed field types
14 | - added the --all-fields-as-text option
15 | - fixed file read with utf8 encoding on Windows platforms
16 |
17 | Contents:
18 |
19 | 1. INSTALLATION
20 | 2. OBTAINING DATA
21 | 3. USING THE SCRIPT
22 |
23 | 1. INSTALLATION
24 |
25 | This is a Python3 script, so you need Python 3 to be installed. Also,
26 | as it deals with mysql connections, you need the mysql.connector
27 | package for python3, too.
28 |
29 | Important: at the time of writing this README, mysql.connector was not
30 | yet available for Python versions newer than 3.5. Hence please
31 | do not use a newer version of Python 3 on any platform.
32 |
33 | For Linux/Mac OS X users, you typically need os packages named
34 |
35 | python3
36 |
37 | and
38 |
39 | python3-mysql.connector
40 |
41 | On Ubuntu and its derivatives, you can install them by running
42 |
43 | apt-get install python3 python3-mysql.connector
44 |
45 | as root. The mysql connector for Python is also available from its
46 | official webpage for your architecture:
47 |
48 | http://dev.mysql.com/downloads/connector/python
49 |
50 | On other Linux platforms, you can install these requirements with the
51 | appropriate package manager.
52 |
53 | Windows users have two options:
54 |
55 | If you install Bash on Ubuntu on Windows for your Windows 10 system
56 | (it is very easy, see our short blog on the topic), you can do
57 | everything in the same way as if you were doing it under Ubuntu Linux.
58 | Another benefit of this approach is that you can also use our
59 | shell-script based solutions on your Windows system.
60 |
61 | If you prefer using native Python on your Windows system, this is also
62 | possible. In this case you need to install Python from
63 |
64 | http://www.python.org
65 |
66 | and mysql.connector, available from
67 |
68 | https://dev.mysql.com/downloads/connector/python
69 |
70 | Having installed these, the script should work flawlessly from the DOS
71 | command line or PowerShell.
72 |
73 |
74 | 2. OBTAINING DATA
75 |
76 | This script is for data downloaded from data feeds of WhoisXML
77 | API. You may use it with any data, including those from daily and
78 | quarterly feeds, and for all formats, including "simple", "regular",
79 | and "full".
80 |
81 | Please consult the manuals of the data feeds regarding the format of
82 | the data.
83 |
84 | You can download data very simply by using our Python download
85 | script, load_mysql_data.py.
86 |
87 | Our scripts are available on github under
88 |
89 | https://github.com/whois-api-llc/whois_database_download_support
90 |
91 |
92 | 3. USING THE SCRIPT
93 |
94 | The script is self-documenting; please see the details of its options
95 | in the output of the --help option:
96 |
97 | ./flexible_csv_to_mysqldb.py --help
98 |
99 | (in DOS command-line, please omit "./")
100 |
101 | We elucidate the use of the script by providing two simple examples,
102 | and outline additional functionality in their description.
103 |
104 | In all examples we use Linux/UNIX style subdirectory specifications,
105 | that is, the path elements are separated with "/". On Windows systems
106 | in DOS command-line or Powershell, you should use backslashes ("\")
107 | instead.
108 |
109 | Note: the script normally reads the data types of the fields for
110 | creating tables from the file "field_types.csv" supplied with the
111 | script. It should be next to the script, i.e. in the same
112 | directory. When using the --all-fields-as-text option, all the fields
113 | apart from the record id "id" and the "domainName" will be of SQL type
114 | "text". The "field_types.csv" is ignored in this case. This option,
115 | however, leads to less efficient data storage and limited indexing
116 | opportunities.
117 |
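For instance, an illustrative call loading everything as text (the table
name here is made up for this illustration; the other options are explained
in the examples below) could be:

./flexible_csv_to_mysqldb.py --all-fields-as-text --mysql-user root --mysql-password MYSQLROOTPASSWORD --mysql-database csvload_test --mysql-table text_test 2017_11_12_eu.csv.gz
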
118 | Example 1.
119 | ----------
120 |
121 | We have downloaded data from the daily feed "cctld_discovered_domain_names_whois" into csv files in a local directory
122 |
123 | /scratch/whois_data/cctld_discovered_domain_names_whois/
124 |
125 | Our files are named *.csv.gz, e.g. 2017_11_12_eu.csv.gz. First we load
126 | all the csv.gz files in the directory into the database "csvload_test",
127 | into a newly created table "daily_test".
128 |
129 | This can be done with the following command:
130 |
131 | ./flexible_csv_to_mysqldb.py --mysql-user root --mysql-password MYSQLROOTPASSWORD --mysql-database csvload_test --overwrite-mysql-table --mysql-table daily_test --threads 4 /scratch/whois_data/cctld_discovered_domain_names_whois/*.csv.gz
132 |
133 | Some important comments and tips:
134 |
135 | - The script uses the downloaded csv.gz and tar.gz files. It
136 | uncompresses them first to a temporary directory, which is "tmpcsv"
137 | in the working directory of the script, but an alternative can be
138 | specified with the --temp-dir command-line option.
139 |
140 | - The data to be loaded can be huge.
141 |
142 | For instance, if you download "regular" csv files for all gtlds from
143 | the quarterly release, the uncompressed files will need more than
144 | 1.4 terabytes. Also, the MySQL database itself will need a space of
145 | similar size in addition.
146 |
147 | Bear this in mind not only when specifying the temporary directory
148 | but also when designing your infrastructure and estimating the time
149 | needed for loading data.
150 |
151 | - The script will first test if all the csv-s are of the same
152 | structure. This structure is deduced from the file headers. It
153 | starts manipulating the database if and only if this is the case.
154 |
155 | - To test if the loading is doable without actually making any changes
156 | to the data, just add the --dry-run option.
157 |
158 | - The script's execution can be made faster on multi-core systems by
159 | using multiple threads.
160 |
161 | E.g. for 4 threads, use the --threads 4 option as in the
162 | example. Note: the uncompression of the files is single-threaded; the
163 | option affects the loading procedure only. Each thread processes one
164 | csv file, so multithreading is useful for multiple csv-s.
165 |
166 | - The database will be created if it does not exist.
167 |
168 | - If you prefer another MySQL user than root, you need to ensure that
169 | the user has sufficient privileges to do the operations.
170 |
171 | - You can add additional data with the same structure to your existing
172 | database.
173 |
174 | E.g. for newly downloaded csv-s, you just need to use the same
175 | command-line without the --overwrite-mysql-table option.
176 |
177 | - However, be careful when using --overwrite-mysql-table: if the
178 | table already exists, the script will drop it, and create an empty
179 | table again.
180 |
181 | Example 2.
182 | ----------
183 |
184 | Our data originate now from the cctld quarterly data feed v6.
We have 185 | downloaded the data for the ".eu" tld into the file 186 | 187 | /scratch/whois_data/v6/csv/tlds/simple/csvs.eu.simple.tar.gz 188 | 189 | We will load these data into the table "quarterly_test" of the 190 | database "csvload_test". The command is: 191 | 192 | ./flexible_csv_to_mysqldb.py --mysql-user root --mysql-password MYSQLROOTPASSWORD --mysql-database csvload_test --mysql-table quarterly_test --overwrite-mysql-table /scratch/whois_data/v6/csv/tlds/simple/csvs.eu.simple.tar.gz 193 | 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /whoisxmlapi_bash_csv_to_mysqldb/README: -------------------------------------------------------------------------------- 1 | README document for 2 | 3 | CSV importing scripts and schema 4 | 5 | Document version: 1.1 6 | Dated: 30-11-2018. 7 | 8 | Contents: 9 | --------- 10 | 1. List of files 11 | 2. Script specification 12 | 3. Example of use 13 | 14 | 1. List of files 15 | ---------------- 16 | 17 | load_csv_file_into_db.sh -- loader script in Bash, this is the one documented here 18 | 19 | load_csv_file_into_db_old.sh -- legacy version of loader script in 20 | Bash, used till July 2017. 21 | 22 | loader_schema_full.sql -- schema file for full csv-s 23 | (daily and quarterly feeds) 24 | loader_schema_regular.sql -- schema file for regular csv-s for both 25 | quarterly and daily feeds. 26 | The fields registrant_rawText, 27 | administrativeContact_rawText, 28 | billingContact_rawText, 29 | technicalContact_rawText, 30 | and zoneContact_rawText 31 | will remain empty when csv-s 32 | from daily feeds are imported. 33 | 34 | loader_schema_regular_daily_only.sql -- schema file for regular csv-s 35 | for data from daily feeds only. 36 | Does not contain the 37 | raw text fields 38 | which are only present 39 | in quarterly data. 40 | 41 | loader_schema_simple.sql -- schema file for simple csv-s 42 | (quarterly feeds only) 43 | README -- this file 44 | 45 | 2. Script specification 46 | ----------------------- 47 | 48 | Name: load_csv_file_into_db.sh -- loads data from a csv file downloaded from WhoisXML API feeds 49 | into a table in a mysql database. 50 | 51 | Synopsis: load_csv_file_into_db.sh [OPTION]... 52 | 53 | Description: The script loads a csv file with Whois information, 54 | downloaded from a subscription to WhoisXML API feeds into a 55 | database table. The type of the csv file (simple, regular, or 56 | full, see the manual of the feeds) should be specified and the 57 | appropriate schema file supplied here should be used. 58 | The table into which data are loaded is 59 | whois_record_flat for 'full' csv-s, 60 | whois_record_flat_simple for 'simple' csv-s, and 61 | whois_record_flat_regular for 'regular' csv-s. 62 | 63 | The command-line options of the script are: 64 | 65 | -h, --help Print this help and exit. 66 | -v, --version Print version information and exit. 67 | --verbose Print more messages. 68 | --mysql-user=USERNAME User name to login to the mysql database (optional). 69 | --mysql-password=PASSWORD Password to login to the data source (optional). 70 | --mysql-database=DATABASE The name of the mysql database to load data into. 71 | --csv-format=FORMAT The format of the csv file to be loaded. Must be one of 'regular', 'simple' or 'full'. 72 | --schema-file=SCHEMAFILE The schema file to be used when loading. These are provided with the script. 73 | --csv-file=CSVFILE The csv file to be loaded. 74 | 75 | 3. 
Example of use
76 | -----------------
77 |
78 | Here we describe a simple and complete workflow.
79 |
80 | Assume we want to have the quarterly data from the v19 version of
81 | quarterly databases, for the ``aeg'' and ``active'' tlds in a mysql
82 | database. We intend to do this by downloading csv files and importing
83 | them. Here is what to do, from scratch.
84 |
85 | Step 1. Download some simple csv-s
86 |
87 | Download the latest version of whoisdownload_bash from the docs
88 | subdirectory of quarterly releases or from
89 | http://bestwhois.org/domain_name_data/docs/scripts, depending on your
90 | subscription.
91 |
92 | The actual version was whoisdownload_bash-0.0.16.tar.gz when this
93 | manual was written, but it may be newer; please download the latest
94 | available one and use its name in the command-line below. Untar the
95 | file, and change into its directory:
96 |
97 | tar zxvf whoisdownload_bash-0.0.16.tar.gz;cd whoisdownload_bash
98 |
99 | In order to download the required csv-s, we use the download script
100 | in the following form (please substitute the CAPITALIZED words with
101 | your username and password):
102 |
103 | ./whoisdownload.sh --verbose --user USERNAME --password PASSWORD --db-version v19 --data-feeds whois_database --tld "aeg active" --file-format simple
104 |
105 | This will have the following output to stdout:
106 |
107 | whois_database/v19/csv/tlds/simple/csvs.aeg.simple.tar.gz [OK]
108 | whois_database/v19/csv/tlds/simple/csvs.active.simple.tar.gz [OK]
109 |
110 | (If instead of [OK] you get something else, then it is likely that
111 | there is something wrong with your password.)
112 |
113 | The resulting files are in the subdirectory
114 |
115 | whois_database/v19/csv/tlds/simple/
116 |
117 | (You may modify this behavior with the command-line argument
118 | --output-dir of whoisdownload.sh, see also
119 | whoisdownload.sh --help
120 | )
121 |
122 | Move the files, csvs.active.simple.tar.gz and csvs.aeg.simple.tar.gz,
123 | to a suitable directory, and uncompress them:
124 |
125 | tar zxvf csvs.active.simple.tar.gz
126 | tar zxvf csvs.aeg.simple.tar.gz
127 |
128 | Now you have a simple subdirectory, in which there are the aeg and
129 | active subdirectories, in which there are the csv files (in the
130 | example, only one of them, named 1.csv).
131 |
132 | Step 2 (optional).
133 | Prepare your mysql database. We assume that mysql is already
134 | installed and you can administer it. Were this not the case, please
135 | consult the documentation of mysql.
136 |
137 | Important: in order to load data into mysql from files, you need to
138 | disable the ``secure-file-priv'' option of mysql. This can be done by
139 | adding the following line:
140 |
141 | secure-file-priv = ""
142 |
143 | to your mysqld configuration file in the section [mysqld]. (The
144 | location of the configuration file varies with the
145 | installation. Traditionally it is called my.cnf. In Ubuntu systems,
146 | for instance, you will find this section in
147 | /etc/mysql/mysql.conf.d/mysqld.cnf.) After editing the config you
148 | need to restart the mysql service.
149 |
150 | If you already have access credentials for the appropriate user and a
151 | database you want to use exists, you may omit the next steps. In our
152 | example we shall use the username ``whoisuser'' who will have the
153 | password ``whoispassword''.
154 |
155 | Otherwise create the respective user: as the mysql administrator do
156 |
157 | CREATE USER whoisuser IDENTIFIED BY 'whoispassword';
158 |
159 | Create a database (``whoisdatabase'' in this example) which we shall
160 | use:
161 |
162 | CREATE DATABASE whoisdatabase;
163 |
164 | Grant all privileges on this database to this user:
165 |
166 | GRANT ALL ON whoisdatabase.* to whoisuser;
167 |
168 | Finally, as you want to load data from files, the user needs to have
169 | the required privileges, so do:
170 |
171 | GRANT file ON *.* to whoisuser;
172 |
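As a quick optional check (a generic mysql client invocation, not part of
the supplied scripts), you can verify that the credentials work:

mysql --user=whoisuser --password=whoispassword whoisdatabase -e "SELECT 1;"

If this prints a one-element table, the user and the database are ready.
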
173 | Step 3. Load your data into the database
174 |
175 | Assume that you have uncompressed the downloaded files in the same
176 | subdirectory where the script resides. To load the files you have to
177 | do
178 |
179 | ./load_csv_file_into_db.sh --mysql-user=whoisuser --mysql-password=whoispassword --mysql-database=whoisdatabase --schema-file=loader_schema_simple.sql --csv-file=simple/aeg/1.csv --csv-format=simple
180 |
181 | and for the other domain:
182 |
183 | ./load_csv_file_into_db.sh --mysql-user=whoisuser --mysql-password=whoispassword --mysql-database=whoisdatabase --schema-file=loader_schema_simple.sql --csv-file=simple/active/1.csv --csv-format=simple
184 |
185 | Logging in to the database, the data will be there in the table
186 | "whois_record_flat_simple"; you can now work with them.
187 |
188 | Note again that the script inserts every record found in the input
189 | csv; it does not check whether a record already exists. This may lead
190 | to duplicates, e.g. when the script is run twice with the same
191 | arguments.
192 |
193 | -------end of document-------
--------------------------------------------------------------------------------
/whoisxmlapi_whoisdownload_bash/supported_ngtlds:
--------------------------------------------------------------------------------
1 | 
aaa,aarp,abarth,abb,abbott,abbvie,abc,able,abogado,abudhabi,academy,accenture,accountant,accountants,aco,active,actor,adac,ads,adult,aeg,aetna,afamilycompany,afl,agakhan,agency,aig,aigo,airbus,airforce,airtel,akdn,alfaromeo,alibaba,alipay,allfinanz,allstate,ally,alsace,americanexpress,americanfamily,amex,amfam,amica,amsterdam,analytics,android,anquan,anz,aol,apartments,app,apple,aquarelle,aramco,archi,army,art,arte,asda,associates,athleta,attorney,auction,audi,audible,audio,auspost,author,auto,autos,avianca,aws,axa,azure,baby,baidu,banamex,bananarepublic,band,bank,bar,barcelona,barclaycard,barclays,barefoot,bargains,baseball,basketball,bauhaus,bayern,bbc,bbt,bbva,bcg,bcn,beats,beauty,beer,bentley,berlin,best,bestbuy,bet,bharti,bible,bid,bike,bing,bingo,bio,black,blackfriday,blanco,blockbuster,blog,bloomberg,blue,bms,bmw,bnl,boats,boehringer,bofa,bom,bond,boo,book,booking,boots,bosch,bostik,boston,bot,boutique,box,bradesco,bridgestone,broadway,broker,brother,brussels,budapest,bugatti,build,builders,business,buy,buzz,bzh,cab,cafe,cal,call,calvinklein,cam,camera,camp,cancerresearch,canon,capetown,capital,capitalone,car,caravan,cards,care,career,careers,cars,cartier,casa,case,caseih,cash,casino,cat,catering,catholic,cba,cbn,cbre,cbs,ceb,center,ceo,cern,cfa,cfd,chanel,channel,chase,chat,cheap,chintai,chloe,christmas,chrome,chrysler,church,cipriani,circle,cisco,citadel,citi,citic,city,cityeats,claims,cleaning,click,clinic,clinique,clothing,cloud,club,clubmed,coach,codes,coffee,college,cologne,comcast,commbank,community,company,compare,computer,comsec,condos,construction,consulting,contact,contractors,cooking,cookingchannel,cool,coop,corsica,country,coupon,coupons,courses,credit,creditcard,creditunion,cricket,crown,crs,cruise,cruises,csc,cuisinella,cymru,cyou,dabur,dad,dance,date,dating,datsun,day,dclk,dds,deal,dealer,deals,degree,delivery,dell,deloitte,delta,democrat,dental,dentist,desi,design,dev,dhl,diamonds,diet,digital,direct,directory,discount,discover,dish,diy,dnp,docs,doctor,dodge,dog,doha,domains,dot,download,drive,dtv,dubai,duck,dunlop,duns,dupont,durban,dvag,dvr,earth,eat,eco,edeka,education,email,emerck,energy,engineer,engineering,enterprises,epost,epson,equipment,ericsson,erni,esq,estate,esurance,eurovision,eus,events,everbank,exchange,expert,exposed,express,extraspace,fage,fail,fairwinds,faith,family,fan,fans,farm,farmers,fashion,fast,fedex,feedback,ferrari,ferrero,fiat,fidelity,fido,film,final,finance,financial,fire,firestone,firmdale,fish,fishing,fit,fitness,flickr,flights,flir,florist,flowers,fly,foo,food,foodnetwork,football,ford,forex,forsale,forum,foundation,fox,free,fresenius,frl,frogans,frontdoor,frontier,ftr,fujitsu,fujixerox,fund,furniture,futbol,fyi,gal,gallery,gallo,gallup,game,games,gap,garden,gbiz,gdn,gea,gent,genting,george,ggee,gift,gifts,gives,giving,glade,glass,gle,global,globo,gmail,gmbh,gmo,gmx,godaddy,gold,goldpoint,golf,goo,goodhands,goodyear,goog,google,gop,got,grainger,graphics,gratis,green,gripe,group,guardian,gucci,guge,guide,guitars,guru,hamburg,hangout,haus,hbo,hdfc,hdfcbank,health,healthcare,help,helsinki,here,hermes,hgtv,hiphop,hisamitsu,hitachi,hiv,hkt,hockey,holdings,holiday,homedepot,homegoods,homes,homesense,honda,honeywell,horse,hospital,host,hosting,hot,hoteles,hotmail,house,how,hsbc,hughes,hyatt,hyundai,ibm,icbc,ice,icu,ieee,ifm,iinet,ikano,imamat,imdb,immo,immobilien,industries,infiniti,ing,ink,institute,insurance,insure,intel,international,intuit,investments,ipiranga,irish,iselect,ismaili,ist,istanbul,itau,itv,iveco,iwc,jaguar,java,jcb,jcp,jeep,
jetzt,jewelry,jio,jlc,jll,jmp,jnj,jobs,joburg,jot,joy,jpmorgan,jprs,juegos,juniper,kaufen,kddi,kerryhotels,kerrylogistics,kerryproperties,kfh,kia,kim,kinder,kindle,kitchen,kiwi,koeln,komatsu,kosher,kpmg,kpn,krd,kred,kuokgroup,kyoto,lacaixa,ladbrokes,lamborghini,lamer,lancaster,lancia,lancome,land,landrover,lanxess,lasalle,lat,latino,latrobe,law,lawyer,lds,lease,leclerc,lefrak,legal,lego,lexus,lgbt,liaison,lidl,life,lifeinsurance,lifestyle,lighting,like,lilly,limited,limo,lincoln,linde,link,lipsy,live,living,lixil,loan,loans,locker,locus,loft,lol,london,lotte,lotto,love,lpl,lplfinancial,ltd,ltda,lundbeck,lupin,luxe,luxury,macys,madrid,maif,maison,makeup,man,management,mango,market,marketing,markets,marriott,marshalls,maserati,mattel,mba,mcd,mcdonalds,mckinsey,med,media,meet,melbourne,meme,memorial,men,menu,meo,metlife,miami,microsoft,mini,mint,mit,mitsubishi,mlb,mls,mma,mobily,moda,moe,moi,mom,monash,money,monster,montblanc,mopar,mormon,mortgage,moscow,moto,motorcycles,mov,movie,movistar,msd,mtn,mtpc,mtr,museum,mutual,mutuelle,nab,nadex,nagoya,nationwide,natura,navy,nba,nec,netbank,netflix,network,neustar,new,newholland,news,next,nextdirect,nexus,nfl,ngo,nhk,nico,nike,nikon,ninja,nissan,nissay,nokia,northwesternmutual,norton,now,nowruz,nowtv,nra,nrw,ntt,nyc,obi,observer,off,office,okinawa,olayan,olayangroup,oldnavy,ollo,omega,one,ong,onl,online,onyourside,ooo,open,oracle,orange,organic,orientexpress,origins,osaka,otsuka,ott,ovh,page,pamperedchef,panasonic,panerai,paris,pars,partners,parts,party,passagens,pay,pccw,pet,pfizer,pharmacy,philips,photo,photography,photos,physio,piaget,pics,pictet,pictures,pid,pin,ping,pink,pioneer,pizza,place,play,playstation,plumbing,plus,pnc,pohl,poker,politie,porn,pramerica,praxi,press,prime,pro,prod,productions,prof,progressive,promo,properties,property,protection,pru,prudential,pub,pwc,qpon,quebec,quest,qvc,racing,radio,raid,read,realestate,realtor,realty,recipes,red,redstone,redumbrella,rehab,reise,reisen,reit,reliance,ren,rent,rentals,repair,report,republican,rest,restaurant,review,reviews,rexroth,rich,richardli,ricoh,rightathome,ril,rio,rip,rmit,rocher,rocks,rodeo,rogers,room,rsvp,ruhr,run,rwe,ryukyu,saarland,safe,safety,sakura,sale,salon,samsclub,samsung,sandvik,sandvikcoromant,sanofi,sap,sapo,sarl,sas,save,saxo,sbi,sbs,sca,scb,schaeffler,schmidt,scholarships,school,schule,schwarz,science,scjohnson,scor,scot,seat,secure,security,seek,select,sener,services,ses,seven,sew,sex,sexy,sfr,shangrila,sharp,shell,shia,shiksha,shoes,shop,shopping,shouji,show,showtime,shriram,silk,sina,singles,site,ski,skin,sky,skype,sling,smart,smile,sncf,soccer,social,softbank,software,sohu,solar,solutions,song,sony,soy,space,spiegel,spot,spreadbetting,srl,srt,stada,staples,star,starhub,statebank,statefarm,statoil,stc,stcgroup,stockholm,storage,store,stream,studio,study,style,sucks,supplies,supply,support,surf,surgery,suzuki,swatch,swiftcover,swiss,sydney,symantec,systems,tab,taipei,talk,taobao,target,tatamotors,tatar,tattoo,tax,taxi,tci,tdk,team,tech,technology,telecity,telefonica,temasek,tennis,teva,thd,theater,theatre,tiaa,tickets,tienda,tiffany,tips,tires,tirol,tjmaxx,tjx,tkmaxx,tmall,today,tokyo,tools,top,toray,toshiba,total,tours,town,toyota,toys,trade,trading,training,travel,travelchannel,travelers,travelersinsurance,trust,trv,tube,tui,tunes,tushu,tvs,ubank,ubs,uconnect,university,uno,uol,ups,vacations,vana,vanguard,vegas,ventures,verisign,versicherung,vet,viajes,video,vig,viking,villas,vin,vip,virgin,visa,vision,vista,vistaprint,viva,vivo,vlaanderen,vodka,volkswagen,volvo,v
ote,voto,voyage,vuelos,wales,walmart,walter,wang,wanggou,warman,watch,watches,weather,weatherchannel,webcam,weber,website,wed,wedding,weibo,weir,whoswho,wien,wiki,williamhill,win,windows,wine,winners,wme,wolterskluwer,woodside,work,works,world,wow,wtc,wtf,xbox,xerox,xfinity,xihuan,xin,xn--11b4c3d,xn--1ck2e1b,xn--1qqw23a,xn--30rr7y,xn--3bst00m,xn--3ds443g,xn--3oq18vl8pn36a,xn--3pxu8k,xn--42c2d9a,xn--45q11c,xn--4gbrim,xn--55qw42g,xn--55qx5d,xn--5su34j936bgsg,xn--5tzm5g,xn--6frz82g,xn--6qq986b3xl,xn--80adxhks,xn--80aqecdr1a,xn--80asehdb,xn--80aswg,xn--9dbq2a,xn--9et52u,xn--9krt00a,xn--b4w605ferd,xn--bck1b9a5dre4c,xn--c1avg,xn--c2br7g,xn--cck2b3b,xn--cg4bki,xn--czr694b,xn--czrs0t,xn--czru2d,xn--d1acj3b,xn--eckvdtc9d,xn--efvy88h,xn--estv75g,xn--fct429k,xn--fhbei,xn--fiq228c5hs,xn--fiq64b,xn--fjq720a,xn--flw351e,xn--fzys8d69uvgm,xn--g2xx48c,xn--gckr3f0f,xn--gk3at1e,xn--hxt814e,xn--i1b6b1a6a2e,xn--imr513n,xn--io0a7i,xn--j1aef,xn--jlq61u9w7b,xn--jvr189m,xn--kcrx77d1x4a,xn--kpu716f,xn--kput3i,xn--mgba3a3ejt,xn--mgba7c0bbn0a,xn--mgbab2bd,xn--mgbb9fbpob,xn--mgbca7dzdo,xn--mgbi4ecexp,xn--mgbt3dhd,xn--mk1bu44c,xn--ngbc5azd,xn--ngbe9e0a,xn--nqv7f,xn--nqv7fs00ema,xn--nyqy26a,xn--p1acf,xn--pbt977c,xn--pssy2u,xn--q9jyb4c,xn--qcka1pmc,xn--rhqv96g,xn--rovu88b,xn--ses554g,xn--t60b56a,xn--tckwe,xn--tiq49xqyj,xn--unup4y,xn--vermgensberater-ctb,xn--vermgensberatung-pwb,xn--vhquv,xn--vuq861b,xn--w4r85el8fhu5dnra,xn--w4rs40l,xn--xhq521b,xn--zfr164b,xperia,xxx,xyz,yachts,yahoo,yamaxun,yandex,yodobashi,yoga,yokohama,you,youtube,yun,zappos,zara,zero,zip,zippo,zone,zuerich, 2 | -------------------------------------------------------------------------------- /whoisxmlapi_mysqldump_loaders/load_mysql_data_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | #Sample script to load ASCII mysql dumps for a tld 4 | #This loads schema+data from a single backup file 5 | #Recommended for smaller tlds. 6 | #Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com 7 | # 8 | #Note: IF YOU ARE READING THIS SCRIPT JUST TO COLLECT IDEAS FOR YOUR OWN LOADER, 9 | # VISIT THE END OF THE FILE WHERE THE REAL WORK IS DONE 10 | # 11 | # Global variables. 12 | # 13 | LANG=C 14 | LC_ALL=C 15 | VERSION="0.0.3" 16 | VERBOSE="yes" 17 | DEBUG="no" 18 | SHOWPROGRESS="no" 19 | MYNAME=$(basename $0) 20 | CATCOMMAND="cat" 21 | 22 | #No mysql stuff by default. This is set by mandatory args. 23 | unset MYSQL_USER 24 | unset MYSQL_PASSWORD 25 | unset MYSQL_DATABASE 26 | 27 | #Importing generic utilities 28 | 29 | source load_mysql_utils.sh 30 | 31 | function printHelpAndExit() 32 | { 33 | echo "Usage: $MYNAME [OPTION]..." 34 | echo "$MYNAME -- loads data for a given tld" 35 | echo "from a schema file and separate table files " 36 | echo " into a table in a mysql database." 37 | echo "" 38 | echo " -h, --help Print this help and exit." 39 | echo " -v, --version Print version information and exit." 40 | echo " --verbose Print more messages." 41 | echo " --show-progress Display progress bars when loading data from dumps." 42 | echo " Recommended, especially for large domains." 43 | echo " --mysql-user=USERNAME User name to login to the mysql database (optional)." 44 | echo " --mysql-password=PASSWORD Password to login to the data source (optional)." 45 | echo " --mysql-database=DATABASE The name of the mysql database to load data into. " 46 | echo " This database is created by the script, so should not exist" 47 | echo " --dump-file=DUMPFILE The file to be loaded. 
If this is provided,"
48 | echo " the rest of the options are invalid"
49 | echo " --tld=TLD load data for this tld"
50 | echo " --dump-files-dir=DIRECTORY The dump files for the tld-s are in this directory. Only for --tld"
51 | echo " --db-version=STRING The version to download. Required for --tld. Format: vNN, e.g. v19"
52 | echo ""
53 | echo "Examples:"
54 | echo ""
55 | echo " -loading sample data downloaded into a directory mysqldump_sample from "
56 | echo " http://domainwhoisdatabase.com/whois_database/sample/gtlds/v20/mysqldump_sample/aaa"
57 | echo ""
58 | echo "$MYNAME --mysql-database=sample_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --dump-files-dir=mysqldump_sample --db-version=v20 --tld=aaa --verbose --show-progress"
59 | echo ""
60 | echo " or the same task specifying the file name and path directly:"
61 | echo ""
62 | echo "$MYNAME --mysql-database=sample_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --dump-file=mysqldump_sample/aaa/whoiscrawler_v20_aaa_mysql.sql.gz --verbose --show-progress"
63 | echo ""
64 | echo ""
65 | echo " -loading production data quietly, downloaded into a directory database_dump/mysqldump/aaa from"
66 | echo " http://www.domainwhoisdatabase.com/whois_database/v20/database_dump/mysqldump/aaa"
67 | echo ""
68 | echo "$MYNAME --mysql-database=production_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --dump-files-dir=database_dump/mysqldump --tld=aaa --db-version=v20"
69 | echo ""
70 | echo " or the same task verbosely, specifying the file name and path directly:"
71 | echo ""
72 | echo "$MYNAME --mysql-database=production_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --dump-file=database_dump/mysqldump/aaa/whoiscrawler_v20_aaa_mysql.sql.gz --verbose --show-progress"
73 |
74 | exit 1
75 | }
76 |
77 | ARGS=$(\
78 | getopt -o hv \
79 | -l "help,verbose,debug,show-progress,version,v,mysql-database:,mysql-user:,mysql-password:,\
80 | dump-file:,tld:,dump-files-dir:,db-version:" \
81 | -- "$@")
82 |
83 |
84 | if [ $? -ne 0 ]; then
85 | exit 6
86 | fi
87 |
88 | eval set -- "$ARGS"
89 |
90 | while true; do
91 | case "$1" in
92 | -h|--help)
93 | shift
94 | printHelpAndExit
95 | ;;
96 |
97 | --verbose)
98 | shift
99 | VERBOSE="true"
100 | ;;
101 |
102 | --debug)
103 | shift
104 | DEBUG="yes"
105 | VERBOSEARG="--verbose"
106 | ;;
107 |
108 | --show-progress)
109 | shift
110 | if which pv > /dev/null; then
111 | CATCOMMAND="pv"
112 | else
113 | printError "The show-progress argument needs pv to be installed (e.g. apt-get install pv)"
114 | exit 1
115 | fi
116 | ;;
117 |
118 | -v|--version)
119 | shift
120 | printVersionAndExit
121 | ;;
122 |
123 | --mysql-user)
124 | shift
125 | db_username=$1
126 | shift
127 | ;;
128 |
129 | --mysql-password)
130 | shift
131 | export MYSQL_PWD=$1
132 | shift
133 | ;;
134 |
135 | --mysql-database)
136 | shift
137 | db=$1
138 | shift
139 | ;;
140 |
141 | --dump-file)
142 | shift
143 | dump_file=$(readlink -e "$1")
144 | if ! [ -f "$dump_file" ]; then
145 | printError "The specified mysql file $dump_file was not found."
146 | exit 1
147 | fi
148 | shift
149 | ;;
150 |
151 | --tld)
152 | shift
153 | TLD=$1
154 | shift
155 | ;;
156 |
157 | --dump-files-dir)
158 | shift
159 | DUMP_FILES_DIR=$1
160 | if ! [ -d "$DUMP_FILES_DIR" ]; then
161 | printError "The specified dump file directory does not exist."
162 | exit 1
163 | fi
164 | shift
165 | ;;
166 |
167 | --db-version)
168 | shift
169 | #format check
170 | if echo "$1" | grep --quiet -e "^v[0-9][0-9]*$"; then
171 | DATABASE_VERSION=$1
172 | else
173 | printError "Invalid db-version specification. It should be like v19 or v6"
174 | exit 1
175 | fi
176 | shift
177 | ;;
178 |
179 | --)
180 | shift
181 | break
182 | ;;
183 |
184 | *)
185 | ;;
186 | esac
187 | done
188 |
189 | #some verification before doing the real job
190 |
191 | if [ -n "$dump_file" ] && [ -n "$TLD" -o -n "$DUMP_FILES_DIR" -o -n "$DATABASE_VERSION" ]; then
192 | printError "Conflicting arguments. Please use either --dump-file or --tld + --dump-files-dir + --db-version."
193 | exit 1
194 | fi
195 |
196 | #Set up mysql login credentials if needed
197 | if [ -n "$db_username" ]; then
198 | MYSQL_ARGUMENTS="--user=$db_username"
199 | fi
200 |
201 | printDebug "Mysql arguments: $MYSQL_ARGUMENTS"
202 | printDebug "Mysql password: $MYSQL_PWD"
203 |
204 | if [ -z "$db" ]; then
205 | printError "Mysql database not specified. See $MYNAME --help"
206 | exit 1
207 | fi
208 |
209 | #If the tld is specified, we find out the dumpfile name.
210 | if [ -z "$dump_file" ]; then
211 | dump_file="$DUMP_FILES_DIR"/"$TLD"/whoiscrawler_"$TLD"_mysql.sql.gz
212 | if [ ! -f "$dump_file" ]; then
213 | dump_file="$DUMP_FILES_DIR"/"$TLD"/whoiscrawler_"$DATABASE_VERSION"_"$TLD"_mysql.sql.gz
214 | fi
215 | #Quarterly feeds case
216 | if [ ! -f "$dump_file" ]; then
217 | TLDUNDERSCORE=$(echo $TLD | sed -e "s/\./_/g")
218 | dump_file="$DUMP_FILES_DIR"/"$TLD"/domains_whoiscrawler_"$DATABASE_VERSION"_"$TLDUNDERSCORE"_mysql.sql.gz
219 | fi
220 | fi
221 |
222 | printVerbose "Dump file: $dump_file"
223 |
224 | if [ ! -f "$dump_file" ]; then
225 | printError "Database dump to be loaded is not specified or it does not exist."
226 | printError "See $MYNAME --help"
227 | exit 1
228 | fi
229 |
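#(Editor's sketch -- an optional addition, not part of the original script:
# fail early if the MySQL server cannot be reached with the credentials
# collected above.)
#if ! mysql ${MYSQL_ARGUMENTS} -e "SELECT 1;" > /dev/null 2>&1; then
#    printError "Cannot connect to the MySQL server with the given credentials."
#    exit 1
#fi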
230 | #THE REAL WORK STARTS HERE
231 | printVerbose "Creating database $db"
232 | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} -e "create database $db"
233 | time=$(date +%s)
234 | echo "loading data from file $dump_file"
235 | if [ "${dump_file: -3}" == ".gz" ]; then
236 |
237 | $CATCOMMAND "$dump_file" | gunzip -c | mysql ${MYSQL_ARGUMENTS} $db
238 | else
239 |
240 | $CATCOMMAND "$dump_file" | mysql ${MYSQL_ARGUMENTS} $db
241 | fi
242 |
243 | time2=$(date +%s)
244 | dur=$(expr $time2 - $time)
245 | echo "took $dur seconds."
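#(Editor's note -- a hedged, optional way to verify the result; the table
# names depend on the dump that was loaded, so this just lists them:)
#mysql ${MYSQL_ARGUMENTS} $db -e "SHOW TABLES;"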
246 | 247 | -------------------------------------------------------------------------------- /whoisxmlapi_percona_loader_scripts/whoiscrawler_mysql_schema.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.5.8, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: whoiscrawler 4 | -- ------------------------------------------------------ 5 | -- Server version 5.5.8-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Table structure for table `contact` 20 | -- 21 | 22 | /*!40101 SET @saved_cs_client = @@character_set_client */; 23 | /*!40101 SET character_set_client = utf8 */; 24 | CREATE TABLE IF NOT EXISTS `contact` ( 25 | `contact_id` bigint(20) NOT NULL AUTO_INCREMENT, 26 | `name` varchar(256) DEFAULT NULL, 27 | `organization` varchar(256) DEFAULT NULL, 28 | `street1` varchar(256) DEFAULT NULL, 29 | `street2` varchar(256) DEFAULT NULL, 30 | `street3` varchar(256) DEFAULT NULL, 31 | `street4` varchar(256) DEFAULT NULL, 32 | `city` varchar(64) DEFAULT NULL, 33 | `state` varchar(45) DEFAULT NULL, 34 | `postal_code` varchar(45) DEFAULT NULL, 35 | `country` varchar(45) DEFAULT NULL, 36 | `email` varchar(256) DEFAULT NULL, 37 | `telephone` varchar(45) DEFAULT NULL, 38 | `telephone_ext` varchar(45) DEFAULT NULL, 39 | `fax` varchar(45) DEFAULT NULL, 40 | `fax_ext` varchar(45) DEFAULT NULL, 41 | `parse_code` smallint(6) DEFAULT NULL, 42 | `raw_text` longtext, 43 | `unparsable` longtext, 44 | `audit_created_date` varchar(45) DEFAULT NULL, 45 | `audit_updated_date` varchar(45) DEFAULT NULL, 46 | PRIMARY KEY (`contact_id`), 47 | KEY `audit_updated_date` (`audit_updated_date`) 48 | ) ENGINE=InnoDB AUTO_INCREMENT=1 ROW_FORMAT=COMPRESSED DEFAULT CHARSET=utf8; 49 | /*!40101 SET character_set_client = @saved_cs_client */; 50 | 51 | -- 52 | -- Table structure for table `domain_names_whoisdatacollector` 53 | -- 54 | 55 | /*!40101 SET @saved_cs_client = @@character_set_client */; 56 | /*!40101 SET character_set_client = utf8 */; 57 | CREATE TABLE IF NOT EXISTS `domain_names_whoisdatacollector` ( 58 | `domain_id` bigint(20) NOT NULL AUTO_INCREMENT, 59 | `domain_name` varchar(256) CHARACTER SET latin1 NOT NULL, 60 | `reshoot` smallint(6) DEFAULT '0', 61 | processed int default 0, 62 | registrar_name varchar(512), 63 | registrar_raw_text longtext, 64 | registry_raw_text longtext, 65 | processed_time timestamp, 66 | PRIMARY KEY (`domain_id`), 67 | UNIQUE KEY `domain_name` (`domain_name`), 68 | KEY `reshoot` (`reshoot`), 69 | KEY `processed` (`processed`) 70 | ) ENGINE=InnoDB AUTO_INCREMENT=1 ROW_FORMAT=COMPRESSED DEFAULT CHARSET=utf8; 71 | /*!40101 SET character_set_client = @saved_cs_client */; 72 | -- 73 | -- Table structure for table `registry_data` 74 | -- 75 | 76 | /*!40101 SET @saved_cs_client = @@character_set_client */; 77 | /*!40101 SET character_set_client = utf8 */; 78 | CREATE TABLE IF NOT EXISTS `registry_data` ( 79 | 
`registry_data_id` bigint(20) NOT NULL AUTO_INCREMENT, 80 | `created_date` varchar(200) DEFAULT NULL, 81 | `updated_date` varchar(200) DEFAULT NULL, 82 | `expires_date` varchar(200) DEFAULT NULL, 83 | `admin_contact_id` bigint(11) DEFAULT NULL, 84 | `registrant_id` bigint(11) DEFAULT NULL, 85 | `technical_contact_id` bigint(11) DEFAULT NULL, 86 | `zone_contact_id` bigint(11) DEFAULT NULL, 87 | `billing_contact_id` bigint(11) DEFAULT NULL, 88 | `domain_name` varchar(256) CHARACTER SET latin1 DEFAULT NULL, 89 | `name_servers` text, 90 | `status` text, 91 | `raw_text` longtext, 92 | `audit_created_date` timestamp NULL DEFAULT NULL, 93 | `audit_updated_date` timestamp NULL DEFAULT NULL, 94 | `unparsable` longtext, 95 | `parse_code` smallint(6) DEFAULT NULL, 96 | `header_text` longtext, 97 | `clean_text` longtext, 98 | `footer_text` longtext, 99 | `registrar_name` varchar(512) DEFAULT NULL, 100 | `whois_server` varchar(512) DEFAULT NULL, 101 | `referral_url` varchar(512) DEFAULT NULL, 102 | `data_error` smallint(6) DEFAULT '0', 103 | PRIMARY KEY (`registry_data_id`), 104 | KEY `domain_name_index` (`domain_name`), 105 | UNIQUE KEY `audit_updated_date` (`audit_updated_date`), 106 | KEY `FK68C3166C7B556202` (`technical_contact_id`), 107 | KEY `FK68C3166C79B00024` (`billing_contact_id`), 108 | KEY `FK68C3166CB8CF12D0` (`admin_contact_id`), 109 | KEY `FK68C3166CD0C7A375` (`registrant_id`), 110 | KEY `FK68C3166C20710653` (`zone_contact_id`), 111 | KEY `data_error` (`data_error`), 112 | CONSTRAINT `FK68C3166C20710653` FOREIGN KEY (`zone_contact_id`) REFERENCES `contact` (`contact_id`), 113 | CONSTRAINT `FK68C3166C79B00024` FOREIGN KEY (`billing_contact_id`) REFERENCES `contact` (`contact_id`), 114 | CONSTRAINT `FK68C3166C7B556202` FOREIGN KEY (`technical_contact_id`) REFERENCES `contact` (`contact_id`), 115 | CONSTRAINT `FK68C3166CB8CF12D0` FOREIGN KEY (`admin_contact_id`) REFERENCES `contact` (`contact_id`), 116 | CONSTRAINT `FK68C3166CD0C7A375` FOREIGN KEY (`registrant_id`) REFERENCES `contact` (`contact_id`) 117 | ) ENGINE=InnoDB AUTO_INCREMENT=1 ROW_FORMAT=COMPRESSED DEFAULT CHARSET=utf8; 118 | /*!40101 SET character_set_client = @saved_cs_client */; 119 | 120 | -- 121 | -- Table structure for table `whois_record` 122 | -- 123 | 124 | /*!40101 SET @saved_cs_client = @@character_set_client */; 125 | /*!40101 SET character_set_client = utf8 */; 126 | CREATE TABLE IF NOT EXISTS `whois_record` ( 127 | `whois_record_id` bigint(20) NOT NULL AUTO_INCREMENT, 128 | `created_date` varchar(200) DEFAULT NULL, 129 | `updated_date` varchar(200) DEFAULT NULL, 130 | `expires_date` varchar(200) DEFAULT NULL, 131 | `admin_contact_id` bigint(11) DEFAULT NULL, 132 | `registrant_id` bigint(11) DEFAULT NULL, 133 | `technical_contact_id` bigint(11) DEFAULT NULL, 134 | `zone_contact_id` bigint(11) DEFAULT NULL, 135 | `billing_contact_id` bigint(11) DEFAULT NULL, 136 | `domain_name` varchar(256) CHARACTER SET latin1 DEFAULT NULL, 137 | `name_servers` text, 138 | `registry_data_id` bigint(11) DEFAULT NULL, 139 | `status` text, 140 | `raw_text` longtext, 141 | `audit_created_date` timestamp NULL DEFAULT NULL, 142 | `audit_updated_date` timestamp NULL DEFAULT NULL, 143 | `unparsable` longtext, 144 | `parse_code` smallint(6) DEFAULT NULL, 145 | `header_text` longtext, 146 | `clean_text` longtext, 147 | `footer_text` longtext, 148 | `registrar_name` varchar(512) DEFAULT NULL, 149 | `data_error` smallint(6) DEFAULT '0', 150 | PRIMARY KEY (`whois_record_id`), 151 | UNIQUE KEY `domain_name_index` (`domain_name`), 152 | KEY 
`audit_updated_date` (`audit_updated_date`),
153 | KEY `audit_created_date` (`audit_created_date`),
154 | KEY `FKE043A3087B556202` (`technical_contact_id`),
155 | KEY `FKE043A30879B00024` (`billing_contact_id`),
156 | KEY `FKE043A308C7212EEF` (`registry_data_id`),
157 | KEY `FKE043A308B8CF12D0` (`admin_contact_id`),
158 | KEY `FKE043A308D0C7A375` (`registrant_id`),
159 | KEY `FKE043A30820710653` (`zone_contact_id`),
160 | KEY `data_error` (`data_error`),
161 | CONSTRAINT `FKE043A30820710653` FOREIGN KEY (`zone_contact_id`) REFERENCES `contact` (`contact_id`),
162 | CONSTRAINT `FKE043A30879B00024` FOREIGN KEY (`billing_contact_id`) REFERENCES `contact` (`contact_id`),
163 | CONSTRAINT `FKE043A3087B556202` FOREIGN KEY (`technical_contact_id`) REFERENCES `contact` (`contact_id`),
164 | CONSTRAINT `FKE043A308B8CF12D0` FOREIGN KEY (`admin_contact_id`) REFERENCES `contact` (`contact_id`),
165 | CONSTRAINT `FKE043A308C7212EEF` FOREIGN KEY (`registry_data_id`) REFERENCES `registry_data` (`registry_data_id`),
166 | CONSTRAINT `FKE043A308D0C7A375` FOREIGN KEY (`registrant_id`) REFERENCES `contact` (`contact_id`)
167 | ) ENGINE=InnoDB AUTO_INCREMENT=1 ROW_FORMAT=COMPRESSED DEFAULT CHARSET=utf8;
168 | /*!40101 SET character_set_client = @saved_cs_client */;
169 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
170 |
171 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
172 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
173 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
174 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
175 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
176 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
177 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
178 |
179 | -- Dump completed on 2012-01-10 5:12:00
180 |
--------------------------------------------------------------------------------
/website_contactscats_to_mysqldb/load_contactscategories_jsonl_to_mysql.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | A sample script to populate a Website Contacts and Categories
4 | MySQL database from WhoisXML API data.
5 | (c) WhoisXML API, Inc. 2019.
6 | """
7 |
8 | import sys
9 | import os
10 | import binascii
11 | import datetime
12 | import argparse
13 | import gzip
14 | import pandas as pd
15 | import mysql.connector as my
16 |
17 | VERSION = "0.0.1"
18 | MYNAME = sys.argv[0].replace('./', '')
19 |
20 | parser = argparse.ArgumentParser(description='''
21 | A sample script to populate a Website Contacts and Categories
22 | MySQL database from WhoisXML API data.
23 | See usage examples in the supplied README file.''',
24 | prog=MYNAME,
25 | formatter_class=argparse.RawTextHelpFormatter)
26 |
27 | # Mysql setup
28 | parser.add_argument('--version',
29 | help='Print version information and exit.',
30 | action='version',
31 | version=MYNAME + ' ver. ' + VERSION + '\n(c) WhoisXML API Inc.')
32 | parser.add_argument('--quiet', action='store_true', help='Suppress all informative messages.')
33 | parser.add_argument('--mysql-host', default='127.0.0.1', type=str,
34 | help='Host name or IP address to reach MySQL server (optional). Default: localhost.')
35 | parser.add_argument('--mysql-port', default='3306', type=str,
36 | help='Port of MySQL database (optional). Default: 3306')
37 | parser.add_argument('--mysql-user', type=str, required=False, default='',
38 | help='User name to login to the MySQL database. 
Default: system user.') 39 | parser.add_argument('--mysql-password', type=str, required=True, default='', 40 | help='Password to login to the MySQL database') 41 | parser.add_argument('--mysql-database', type=str, required=True, 42 | help='The name of the MySQL database to load data into.') 43 | parser.add_argument('--mysql-errors', action='store_true', help='Print wrong SQL inserts.') 44 | parser.add_argument('--chunksize', type=int, help= 45 | 'Maximum size of chunks to be read from the file and committed into the DB at once. Default=100.000', 46 | default=100000) 47 | parser.add_argument('--nchunksmax', type=int, help= 48 | 'Number of chunks to load. Default=0, stands for all. Change for testing purposes.', 49 | default=0) 50 | parser.add_argument('--jsonl-file', type=str, required=True, 51 | help='The jsonl file to load') 52 | parser.add_argument('--categories-only', action='store_true', help='Categories only file. No contact info included.') 53 | args = parser.parse_args() 54 | 55 | 56 | 57 | def print_verbose(message): 58 | #Function to give some feedback 59 | if not args.quiet: 60 | sys.stderr.write( 61 | MYNAME + ' ' + datetime.datetime.now().strftime( 62 | '%Y-%m-%d %H:%M:%S') + ': ' + message + '\n') 63 | sys.stderr.flush() 64 | 65 | def is_gz_file(filepath): 66 | #Check if the file is gzipped by checking it magic number 67 | with open(filepath, 'rb') as test_f: 68 | return binascii.hexlify(test_f.read(2)) == b'1f8b' 69 | 70 | def getfield(dataobj, field_name): 71 | """Get the field from a data object 72 | or return None if it does not exist 73 | or is an empty string.""" 74 | try: 75 | result = dataobj.__getattribute__(field_name) 76 | except AttributeError: 77 | result = None 78 | try: 79 | if result.isspace(): 80 | result = None 81 | except AttributeError: 82 | pass 83 | return(result) 84 | 85 | def getdictval(dictionary, key): 86 | """Get a value from a dictionary or None if 87 | does not exist or is an empty string.""" 88 | try: 89 | result = dictionary[key] 90 | except KeyError: 91 | result = None 92 | if not result or not result.strip(): 93 | result = None 94 | return(result) 95 | 96 | #Some minor checks: if any files 97 | if args.jsonl_file is not None: 98 | if not os.path.isfile(args.jsonl_file): 99 | raise ValueError( 100 | 'the specified file "%s" does not exist'%args.jsonl_file) 101 | print_verbose('Will load data from %s\n'%args.jsonl_file) 102 | 103 | 104 | #Here we connect the database 105 | print_verbose("Opening db connection.") 106 | cnx = my.connect(user=args.mysql_user, password=args.mysql_password, 107 | host=args.mysql_host, database=args.mysql_database, 108 | port=args.mysql_port) 109 | cnx.set_charset_collation(charset='utf8mb4', collation='utf8mb4_unicode_ci') 110 | #Defining the cursor 111 | cursor = cnx.cursor(dictionary=True) 112 | print_verbose("Turning off foreign key checks.") 113 | cursor.execute("SET foreign_key_checks = 0") 114 | 115 | #Main job: inserting data 116 | nerrors = 0 117 | nchunks = 0 118 | nrecords = 0 119 | #Opening input file 120 | if is_gz_file(args.jsonl_file): 121 | print_verbose('Opening gzipped input file %s'%args.jsonl_file) 122 | infile = gzip.open(args.jsonl_file, 'rt', encoding='utf-8') 123 | else: 124 | print_verbose('Opening input file %s'%args.jsonl_file) 125 | infile = open(args.jsonl_file, 'rt') 126 | for chunk in pd.read_json(infile, chunksize=args.chunksize, lines=True, encoding='UTF-8'): 127 | records = [r for r in chunk.itertuples()] 128 | for r in records: 129 | #Main fields 130 | main_data = [getfield(r, 
'domainName'), getfield(r, 'countryCode')]
131 | if main_data[0] is None:
132 | print_verbose("Record error: undefined domain name in " + str(r))
133 | nerrors += 1
134 | continue
135 | if args.categories_only:
136 | cursor.execute(
137 | 'INSERT INTO domain(domainName, countryCode) values(%s,%s)',
138 | main_data)
139 | recordid = cursor.lastrowid
140 | else:
141 | #Meta fields
142 | main_data.append(getdictval(getfield(r, 'meta'), 'title'))
143 | main_data.append(getdictval(getfield(r, 'meta'), 'description'))
144 | #socialLinks fields
145 | main_data.append(getdictval(getfield(r, 'socialLinks'), 'facebook'))
146 | main_data.append(getdictval(getfield(r, 'socialLinks'), 'googlePlus'))
147 | main_data.append(getdictval(getfield(r, 'socialLinks'), 'instagram'))
148 | main_data.append(getdictval(getfield(r, 'socialLinks'), 'twitter'))
149 | main_data.append(getdictval(getfield(r, 'socialLinks'), 'linkedIn'))
150 | cursor.execute(
151 | 'INSERT INTO domain(domainName, countryCode, meta_title, meta_description, socialLinks_facebook, socialLinks_googlePlus, socialLinks_instagram, socialLinks_twitter, socialLinks_linkedIn) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
152 | main_data)
153 | recordid = cursor.lastrowid
154 | #Child records
155 | #emails
156 | for child in getfield(r, 'emails') or []:
157 | cursor.execute(
158 | 'INSERT INTO email(domainID, description, email) VALUES(%s,%s,%s)', (
159 | recordid,
160 | getdictval(child, 'description'),
161 | getdictval(child, 'email')))
162 | #phone numbers
163 | for child in getfield(r, 'phones') or []:
164 | cursor.execute(
165 | 'INSERT INTO phone(domainID, description, phoneNumber, callHours) VALUES(%s,%s,%s,%s)', (
166 | recordid,
167 | getdictval(child, 'description'),
168 | getdictval(child, 'phoneNumber'),
169 | getdictval(child, 'callHours')))
170 | #postal addresses
171 | for child in getfield(r, 'postalAddresses') or []:
172 | cursor.execute(
173 | 'INSERT INTO postalAddress(domainID, postalAddress) VALUES(%s,%s)', (
174 | recordid,
175 | child))
176 | #company names
177 | for child in getfield(r, 'companyNames') or []:
178 | cursor.execute(
179 | 'INSERT INTO companyName(domainID, companyName) VALUES(%s,%s)', (
180 | recordid,
181 | child))
182 | #Now upserting category
183 | for child in getfield(r, 'categories') or []:
184 | cursor.execute(
185 | 'INSERT IGNORE INTO category(category) VALUES(%s)',
186 | (child,))
187 | cursor.execute(
188 | 'INSERT INTO domain_category(categoryID, domainID) VALUES(%s, %s)',
189 | (child, recordid))
190 | nrecords += 1
191 | cnx.commit()
192 | print_verbose(
193 | 'Committed chunk %d (%d records of max %d)\n' % (
194 | nchunks + 1, len(records), args.chunksize))
195 | nchunks += 1
196 | if nchunks == args.nchunksmax:
197 | break
198 |
199 | print_verbose("Committed %d records into domains" % nrecords)
200 | print_verbose("Total number of errors: %d " % nerrors)
201 | print_verbose("Closing input file")
202 | infile.close()
203 | print_verbose("Turning on foreign key checks.")
204 | cursor.execute("SET foreign_key_checks = 1")
205 | print_verbose("Closing db connection.")
206 | cnx.close()
207 |
208 |
--------------------------------------------------------------------------------
/whoisxmlapi_whoisdownload_bash/README:
--------------------------------------------------------------------------------
1 | Changes
2 | ------
3 |
4 | 0.0.26.
5 | o Added the "domains", "verified_domains", "reserved_domains", and
6 | "missing_domains" data formats to download domain lists from
7 | quarterly ccTLD and gTLD db releases
8 | o Modified base URLs to use https instead of http.
9 |
10 | 0.0.25.
11 | o Removed the use of "realpath" as it is not available on Mac OS X by default.
12 |
13 | 0.0.24.
14 | o Added support for data feeds domain_names_dropped_whois_archive
15 | and ngtlds_domain_names_dropped_whois_archive
16 |
17 | 0.0.23.
18 | o Messages and return codes for the whois_database_combined feed
19 | revised again
20 | o Introduced the "thin" option to download data for tlds com and net
21 | from whois_database
22 |
23 | 0.0.22.
24 | o More consistent messages when downloading multipart archives,
25 | especially from the whois_database_combined feed.
26 | If there is just one file in the feed, the return code will now be 2,
27 | otherwise it is zero.
28 | o Fixed to give a return code of 1 (instead of 0) for an unhandled feed
29 | o Introduced a --show-progress option to have progress bars for the downloads
30 |
31 | 0.0.21.
32 | o More informative error messages
33 | o Minor code simplification
34 | o Fixed a bug affecting explicit specification of ssl auth files
35 | in command-line or in variables
36 |
37 | 0.0.20.
38 | o Fixed some minor bugs
39 | o Fixed some return codes, to be more coherent with the python version:
40 | Now
41 | - return code 2 is for a file which is not found on the server (previously returned 0 or 2 sometimes)
42 | - in case of success return code 0 is returned (in case of daily feeds it was buggy)
43 | - return code 1 and those greater than 2 are for abnormal termination
44 |
45 | 0.0.19.
46 | o Fixed the following archive feeds to download data
47 | from the year-named directories of past years:
48 | -domain_names_whois_archive
49 | -domain_names_whois_filtered_reg_country_archive
50 | -domain_names_whois_filtered_reg_country_noproxy_archive
51 | -ngtlds_domain_names_whois_archive
52 | -ngtlds_domain_names_whois_filtered_reg_country_archive
53 | -ngtlds_domain_names_whois_filtered_reg_country_noproxy_archive
54 | o Added support for the following data feeds:
55 | -domain_names_diff_whois_filtered_reg_country2
56 | -cctld_discovered_domain_names_whois_archive
57 | -reported_for_removal
58 | o Implemented a "--list-supported-tlds" option
59 |
60 | 0.0.18.
61 | o Added support for ssl key authentication.
62 |
63 | 0.0.17:
64 | o Added domain_names_whois2 target.
65 |
66 | 0.0.16:
67 | o Handling 403: Forbidden errors in wget
68 | o Dry run emulates 10 part-files when downloading multipart archives
69 | (Can be configured with the DRY_RUN_MULTIFILE_LIMIT variable)
70 | o Quarterly downloads require explicit database specifications
71 |
72 | 0.0.15:
73 | o Added daily ngtlds feeds such as:
74 | - ngtlds_domain_names_whois_filtered_reg_country
75 | - ngtlds_domain_names_whois_filtered_reg_country_noproxy
76 | - ngtlds_domain_names_whois_archive
77 | - ngtlds_domain_names_whois_filtered_reg_country_archive
78 | - ngtlds_domain_names_whois_filtered_reg_country_noproxy_archive
79 |
80 | 0.0.14:
81 | o Cctld newly discovered daily data domain_names_new and domain_names_whois added.
82 | o Cctld newly registered daily data domain_names_new, domain_names_whois, domain_names_dropped,
83 | and domain_names_dropped_whois added.
84 | o Fixed a bug with whois_database_combined not working exactly as intended.
85 | o domain_list_quarterly sql feed fixed
86 |
87 | 0.0.13:
88 | o Fixed bug where the ngtlds feed was getting an error when downloading supported tlds
89 |
90 | 0.0.12:
91 | o Fixed bug where "bad feeds" were not being handled properly.
92 | o Added the file format "all", which downloads all available file formats for a given feed.
93 | o Added full and sql file formats to the domain_names_whois_archive feed.
94 |
95 | 0.0.11:
96 | o Added data feed support for domain_names_dropped_whois and ngtlds_domain_names_dropped_whois
97 |
98 | 0.0.10:
99 | o Added download support for legacy gtld quarterly data v1 and v2
100 |
101 | 0.0.9:
102 | o Input date format fixed.
103 |
104 | 0.0.8:
105 | o Added the feed whois_database_combined. Could not test because auth failed.
106 |
107 | 0.0.7:
108 | o Removed the --dry command-line options from the tests. There are now two tests:
109 | ut_whoisdownload with dry run and ft_whoisdownload with full download.
110 |
111 | 0.0.6:
112 | o Added the --tld-file command-line option. If it is provided, the script will expect a
113 | list of domains (e.g. asia,us,tel) in the file and will use that instead of
114 | the downloadable tld files.
115 | o Modified and enhanced the tests to check the new tld download methods.
116 | o If needed, the supported_gtlds and supported_ngtlds files are loaded from the
117 | directory where the script was started. This was wrong in the previous
118 | version.
119 | o Downloading of tlds for the whois_database feed is fixed.
120 |
121 | 0.0.5:
122 | o Added the new ways to download the tld list. Need to be tested.
123 |
124 | 0.0.4:
125 | o All the date formats that are supported by the date(1) utility are now
126 | supported by the --date option.
127 | o Because of this I had to re-organize the code, and so multiple dates are now
128 | available by using the --date option more than once (e.g.
129 | --date="2015-10-20" --date="2015-10-21"). This date change proved to be more
130 | of an annoyance than I thought, sorry about that.
131 | o It is now possible to run the program without the --date option if the
132 | data feed needs no date to be set.
133 | o Added the long file format name versions, so from now on the regular_csv,
134 | simple_csv, full_csv and mysqldump file formats are also accepted.
135 | o Some data sources dropped some older dates I used in tests, so I had to
136 | modify my tests and re-run them.
137 |
138 | SSL authentication setup
139 | ------------------------
140 | Consult README.SSL on how to set up ssl authentication if you have
141 | obtained the required files from WhoisXML API, Inc.
142 |
143 | If you have set up ssl authentication, you can use
144 |
145 | --auth-type=ssl
146 |
147 | instead of the --user and --password options in the examples below.
148 |
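For instance, Example 1 below would read as follows with ssl authentication
(an illustrative variant, assuming the certificate files are installed as
described in README.SSL):

./whoisdownload.sh \
	--auth-type=ssl \
	--date=2018-01-10 \
	--output-dir=./tmp \
	--data-feeds=domain_names_new
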
149 | Examples
150 | --------
151 | Example 1
152 | Downloading the domain_names_new data source for all the top level domains at
153 | one specific date.
154 |
155 | ./whoisdownload.sh \
156 | --user=demo \
157 | --password=XXXXXX \
158 | --date=2018-01-10 \
159 | --output-dir=./tmp \
160 | --data-feeds=domain_names_new
161 |
162 |
163 | Example 2
164 | Download the ngtlds_domain_names_new feed for three consecutive days for the
165 | abc and actor domains.
166 |
167 | ./whoisdownload.sh \
168 | --user=demo \
169 | --password=XXXXXX \
170 | --tld="abc actor" \
171 | --date=2018-01-10 \
172 | --n=3 \
173 | --output-dir=./tmp \
174 | --data-feeds=ngtlds_domain_names_new
175 |
176 |
177 | Example 3
178 | Downloading the domain_names_new data source of the .aero domain for 14 days
179 | starting at a specific date.
180 |
181 | ./whoisdownload.sh \
182 | --user=demo \
183 | --password=XXXXXX \
184 | --date=2018-01-10 \
185 | --output-dir=./tmp \
186 | --tld=aero \
187 | --n=14 \
188 | --data-feeds=domain_names_new
189 |
190 |
191 | Example 4
192 | Downloading the domain_names_whois data source for all the supported tlds at a
193 | specific date.
194 |
195 | ./whoisdownload.sh \
196 | --user=demo \
197 | --password=XXXXXX \
198 | --date=2018-01-10 \
199 | --output-dir=./tmp \
200 | --data-feeds=domain_names_whois
201 |
202 |
203 | Example 5
204 | Downloading files for two data sources, two domains and two dates, six
205 | downloads altogether.
206 |
207 | ./whoisdownload.sh \
208 | --user=demo \
209 | --password=XXXXXX \
210 | --output-dir=./tmp \
211 | --date="2018-01-20 2018-02-10" \
212 | --tld="org info" \
213 | --data-feeds="domain_names_new domain_names_dropped"
214 |
215 |
216 | Example 6
217 |
218 | Download the v20 version of the whois_database for one tld. (The date
219 | argument is required but ignored.)
220 |
221 | ./whoisdownload.sh \
222 | --verbose \
223 | --user=demo \
224 | --password=XXXXXX \
225 | --file-format=simple \
226 | --db-version=v20 \
227 | --date=2018-01-01 \
228 | --tld=tel \
229 | --output-dir=./tmp \
230 | --data-feeds=whois_database
231 |
232 |
233 | Example 7
234 | Download the v19 version of the whois_database for all tlds.
235 | (The date argument is required but ignored.)
236 |
237 | ./whoisdownload.sh \
238 | --verbose \
239 | --user=demo \
240 | --password=XXXXXX \
241 | --file-format=simple \
242 | --db-version=v19 \
243 | --date=2018-01-01 \
244 | --output-dir=./tmp \
245 | --data-feeds=whois_database
246 |
247 | Example 8
248 | Download the v6 version of the cctld whois_database for all tlds.
249 | (The date argument is required but ignored.)
250 |
251 | ./whoisdownload.sh \
252 | --verbose \
253 | --user=demo \
254 | --password=XXXXXXX \
255 | --file-format=simple \
256 | --date=2018-01-01 \
257 | --db-version=v6 \
258 | --output-dir=./tmp \
259 | --data-feeds=domain_list_quarterly
260 |
261 | Example 9
262 | Download the v6 version of the cctld whois_database for one tld.
263 | (The date argument is required but ignored.)
264 |
265 | ./whoisdownload.sh \
266 | --verbose \
267 | --user=demo \
268 | --password=XXXXXX \
269 | --file-format=simple \
270 | --date=2018-01-01 \
271 | --db-version=v6 \
272 | --tld=uk \
273 | --output-dir=./tmp \
274 | --data-feeds=domain_list_quarterly
--------------------------------------------------------------------------------
/website_contactscats_to_mysqldb/load_contactscategories_jsonl_to_mysql.txt:
--------------------------------------------------------------------------------
1 | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
2 | LOAD_CONTACTSCATEGORIES_JSONL_TO_MYSQL.PY - A
3 | SCRIPT TO LOAD WEBSITE CONTACTS &
4 | CATEGORIES DATA INTO MYSQL
5 |
6 |
7 | WhoisXML API, Inc. 2019.
8 | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
9 |
10 |
11 | Table of Contents
12 | ─────────────────
13 |
14 | 1 About the script
15 | .. 1.1 A usable demonstration script
16 | .. 1.2 Cross-platform
17 | 2 Prerequisites
18 | .. 2.1 Python libraries
19 | .. 
20 | 3 How to use
21 | .. 3.1 Setting up a mysql database
22 | 4 Loading data with the script
23 | 5 Limitations
24 | 6 Performance notes
25 | 
26 | 
27 | version 0.0.1
28 | 
29 | 
30 | 1 About the script
31 | ══════════════════
32 | 
33 | This script is intended for subscribers of WhoisXML API's Website
34 | Contacts & Categories database download product
35 | ([http://website-contacts-database.whoisxmlapi.com]) to help load
36 | downloaded jsonl files into a MySQL database.
37 | 
38 | 
39 | 1.1 A usable demonstration script
40 | ─────────────────────────────────
41 | 
42 | The script is intended to be a programming example of how to
43 | accomplish this task, and also to be useful as it is. For this reason
44 | the script is not perfectly robust: the validity of the input files
45 | is not checked, and the exception handling is not very detailed
46 | either, in order to avoid complex and less readable code. If used
47 | properly, however, it can be used efficiently.
48 | 
49 | 
50 | 1.2 Cross-platform
51 | ──────────────────
52 | 
53 | The script should work on any system where Python and the necessary
54 | libraries are available. It has been tested on Ubuntu Linux and
55 | Microsoft Windows, but it should work on other platforms such as Mac
56 | OS X, too.
57 | 
58 | 
59 | 2 Prerequisites
60 | ═══════════════
61 | 
62 | 2.1 Python libraries
63 | ────────────────────
64 | 
65 | The loader script is written in Python 3; it was tested with
66 | Python 3.6.7 on Linux and Python 3.7.2 on Windows. It uses the
67 | following libraries:
68 | 
69 | • Pandas ([https://pandas.pydata.org/]), a data analysis library,
70 | used to efficiently load jsonl files in chunks.
71 | • MySQL connector: the library to access MySQL databases, provided by
72 | Oracle.
73 | 
74 | While Pandas can simply be installed with the package manager ("pip
75 | install pandas"), the vanilla MySQL connector is available from its
76 | download web-page (dev.mysql.com/downloads/connector/python/). On
77 | some systems you may install both with the package manager of your OS
78 | (e.g. with "apt" on Debian-flavor Linuxes, including Ubuntu and Mint).
79 | 
80 | 
81 | 2.2 MySQL settings
82 | ──────────────────
83 | 
84 | As the data can contain Unicode characters stored on 4 bytes,
85 | while MySQL uses a 3-byte encoding by default, it is recommended to
86 | enable 4-byte Unicode system-wide by adding the lines
87 | ┌────
88 | │ character-set-server = utf8mb4
89 | │ collation-server = utf8mb4_unicode_ci
90 | └────
91 | to the configuration file of the MySQL server
92 | (e.g. /etc/mysql/mysql.conf.d/mysqld.cnf on Ubuntu systems) and
93 | restarting the service. Please consult the documentation of MySQL if
94 | you prefer not to modify this setting system-wide.
95 | 
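To check that the setting is in effect after the restart, one can
query the server (a quick sanity check; the variable name is a
standard MySQL one):
┌────
│ mysql -e "SHOW VARIABLES LIKE 'character_set_server';"
└────
which should report utf8mb4.
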
In this demonstration we 112 | assume that the two sample files: categories_database_sample.jsonl 113 | (domain names only) and contacts_database_sample.jsonl (full data) are 114 | to be loaded into a MySQL database. In a production environment the 115 | desired files should be used instead. 116 | 117 | The script automatically detects and decompresses gzip compressed 118 | files, so if you have downloaded a large file in this format, there is 119 | no need to uncompress it in advance. 120 | 121 | The following description has been prepared on a Ubuntu Linux 122 | system. The script works on Windows, too, provided that Python, MySQL 123 | and the necessary packages are installed. In the Windows command-line 124 | the "./" before the script's name is not needed. 125 | 126 | 127 | 3.1 Setting up a mysql database 128 | ─────────────────────────────── 129 | 130 | We create a user and a database for the purpose: as a root user we do 131 | ┌──── 132 | │ create user websitecc identified by 'websitecc'; 133 | │ create database websitecc; 134 | │ grant all on websitecc.* to websitecc; 135 | └──── 136 | Next we create the schema for loading data. For this reason we run the 137 | appropriate ddl file supplied for this script: 138 | ┌──── 139 | │ mysql --user=websitecc --password=websitecc --database=websitecc < website_categories.ddl 140 | └──── 141 | for domain-names only data or 142 | ┌──── 143 | │ mysql --user=websitecc --password=websitecc --database=websitecc < website_contacts_categories.ddl 144 | └──── 145 | for complete data. This will create a simple schema with an n-m 146 | connection in the domains only case: [./website_categories_schema.png] 147 | 148 | and a similar schema with a more detailed "domain" table having more 149 | children when contact data are also included: 150 | [./website_contats_categories_schema.png] 151 | 152 | (Note: we use TEXT fields because of the unpredictable lengths and 153 | LONGBLOBs because of the unpredictable character sets. This should be 154 | taken into account when putting indices on these fields.) 155 | 156 | 157 | 4 Loading data with the script 158 | ══════════════════════════════ 159 | 160 | Once the database has been prepared as described, and the data have 161 | been downloaded, they can be loaded into the database with the 162 | script. The script is self-documenting, it supports the –help option: 163 | ┌──── 164 | │ ./contactscategories_jsonl_to_mysql.py --help 165 | └──── 166 | gives information about the syntax and a full list of currently 167 | supported options. 168 | 169 | A typical way of loading domain names only is 170 | ┌──── 171 | │ ./load_contactscategories_jsonl_to_mysql.py \ 172 | │ --jsonl-file categories_database_sample.jsonl --domain-names-only \ 173 | │ --mysql-password websitecc --mysql-database websitecc \ 174 | │ --mysql-user websitecc 175 | └──── 176 | whereas loading full data can be done with 177 | ┌──── 178 | │ ./load_contactscategories_jsonl_to_mysql.py \ 179 | │ --jsonl-file contacts_database_sample.jsonl \ 180 | │ --mysql-password websitecc --mysql-database websitecc \ 181 | │ --mysql-user websitecc 182 | └──── 183 | Notes: 184 | • The –domain-names-only option can be used with files having full 185 | information and/or schemata with full information. In this case the 186 | contact information shall be ignored. The other way around, trying 187 | to load domain names only files without this option will result in 188 | error messages and malfunction. 189 | 190 | • The script loads lines of the files in chunks. 
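In particular, MySQL cannot index the full length of a TEXT or
LONGBLOB column, so a prefix index has to be used. A hypothetical
example (the table and column names are illustrative only):
┌────
│ ALTER TABLE domain ADD INDEX domain_name_idx (domain_name(64));
└────
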
156 | 
157 | 4 Loading data with the script
158 | ══════════════════════════════
159 | 
160 | Once the database has been prepared as described, and the data have
161 | been downloaded, they can be loaded into the database with the
162 | script. The script is self-documenting; it supports the --help option:
163 | ┌────
164 | │ ./load_contactscategories_jsonl_to_mysql.py --help
165 | └────
166 | gives information about the syntax and a full list of currently
167 | supported options.
168 | 
169 | A typical way of loading domain names only is
170 | ┌────
171 | │ ./load_contactscategories_jsonl_to_mysql.py \
172 | │ --jsonl-file categories_database_sample.jsonl --domain-names-only \
173 | │ --mysql-password websitecc --mysql-database websitecc \
174 | │ --mysql-user websitecc
175 | └────
176 | whereas loading full data can be done with
177 | ┌────
178 | │ ./load_contactscategories_jsonl_to_mysql.py \
179 | │ --jsonl-file contacts_database_sample.jsonl \
180 | │ --mysql-password websitecc --mysql-database websitecc \
181 | │ --mysql-user websitecc
182 | └────
183 | Notes:
184 | • The --domain-names-only option can be used with files having full
185 | information and/or schemata with full information. In this case the
186 | contact information will be ignored. The other way around, trying
187 | to load domain-names-only files without this option will result in
188 | error messages and malfunction.
189 | 
190 | • The script loads lines of the files in chunks. Commits occur after
191 | each chunk; this is a typical approach in relational data
192 | population. The size of the chunks can be tuned with the --chunksize
193 | option, and the number of chunks to be loaded can be limited by the
194 | --nchunks option. (A sketch of this chunked approach follows this list.)
195 | 
196 | • Foreign key checks are turned off at the beginning and turned on at
197 | the end for better performance.
198 | 
199 | • The categories are inserted and updated dynamically.
200 | 
201 | • If you encounter a "Memory error", decreasing the chunk size can
202 | help. This error occurs when you have a huge file to load.
203 | 
204 | 
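The essence of the chunked approach is roughly the following (a
minimal sketch, not the script itself; the table, column and jsonl
field names are illustrative only):
┌────
│ import pandas
│ import mysql.connector
│ 
│ connection = mysql.connector.connect(
│     user='websitecc', password='websitecc', database='websitecc')
│ cursor = connection.cursor()
│ 
│ # Read the jsonl file lazily, 1000 records at a time.
│ for chunk in pandas.read_json('categories_database_sample.jsonl',
│                               lines=True, chunksize=1000):
│     for record in chunk.itertuples():
│         # The real loader fills several related tables here.
│         cursor.execute('INSERT INTO domain (domain_name) VALUES (%s)',
│                        (record.domainName,))
│     # One commit per chunk keeps the transactions reasonably sized.
│     connection.commit()
│ connection.close()
└────
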
205 | 5 Limitations
206 | ═════════════
207 | 
208 | The script is mainly for demonstration purposes, but it can be used
209 | in practice, too. It has, however, the following limitations:
210 | 
211 | • The schema is hard-coded; the table names should be the same as in the
212 | provided ddl-s. This can easily be customized by rewriting the script.
213 | 
214 | • The script can only populate databases: apart from the categories, it
215 | does not verify whether a record already exists.
216 | 
217 | • It reads records with children line by line; thus e.g. e-mails
218 | belonging to multiple records will be duplicated. This violates
219 | normalization. In the future the script will have a mode to
220 | overcome this, but it will be optional for performance reasons:
221 | in many cases it is more efficient to load the data as they are and
222 | take the unnormalized nature into account.
223 | 
224 | • The script does not yet support multi-threaded operation.
225 | 
226 | 
227 | 6 Performance notes
228 | ═══════════════════
229 | 
230 | During testing with a large file we had the following experience.
231 | 
232 | The test was run on Ubuntu 18.04.1 LTS, mysqld Ver
233 | 5.7.24-0ubuntu0.18.04.1 for Linux on x86_64 (Ubuntu), Python 3.6.7,
234 | pandas 0.23.4, mysql.connector 2.1.6; on a machine with an Intel(R) Core(TM)
235 | i7-7700 CPU @ 3.60GHz and 4 gigabytes of RAM, running in a VirtualBox
236 | environment hosted on the same version of Linux, on a Dell Precision
237 | 3620 Mini Tower workstation.
238 | 
239 | Loading 12,921,323 records of contacts and categories from a gzipped
240 | jsonl file of size 951M took about 3 hours.
241 | 
242 | We also remark that under Windows 10 on the same (virtual) hardware we
243 | encountered a "Memory error", so a Windows system probably needs more
244 | memory for this task.
245 | 
--------------------------------------------------------------------------------
/whoisxmlapi_percona_loader_scripts/load_whois_percona.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #Sample script to load binary mysql dumps for a tld
4 | #
5 | #Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
6 | #
7 | #Note: IF YOU ARE READING THIS SCRIPT JUST TO COLLECT IDEAS FOR YOUR OWN LOADER,
8 | # VISIT THE END OF THE FILE WHERE THE REAL WORK IS DONE
9 | #
10 | # Global variables.
11 | #
12 | LANG=C
13 | LC_ALL=C
14 | VERSION="0.0.3"
15 | VERBOSE="no"
16 | DEBUG="no"
17 | MYNAME=$(basename $0)
18 | 
19 | #No mysql stuff by default. This is set by mandatory args.
20 | unset MYSQL_USER
21 | unset MYSQL_PASSWORD
22 | #Default mysql data directory
23 | MYSQL_DATA_DIR=/var/lib/mysql
24 | #Default mysql start and stop commands
25 | MYSQL_STOP_COMMAND="/etc/init.d/mysql stop"
26 | MYSQL_START_COMMAND="/etc/init.d/mysql start"
27 | #Dry run: do not touch the database
28 | DRY_RUN="No"
29 | #Importing generic utilities
30 | 
31 | source load_mysql_utils.sh
32 | 
33 | function printHelpAndExit()
34 | {
35 |     echo "Usage: $MYNAME [OPTION]..."
36 |     echo "$MYNAME -- loads data for given tlds"
37 |     echo "from binary mysql dumps"
38 |     echo " into a mysql database."
39 |     echo ""
40 |     echo " -h, --help Print this help and exit."
41 |     echo " -v, --version Print version information and exit."
42 |     echo " --verbose Print more messages. Recommended."
43 |     echo " --debug Print extensive debug messages."
44 |     echo " --dry-run Dry run: do not touch the db, just verify and extract data."
45 |     echo " --mysql-user=USERNAME User name to login to the mysql database (optional)."
46 |     echo " --mysql-password=PASSWORD Password to login to the data source (optional)."
47 |     echo " --mysql-data-dir=DIRECTORY The directory where mysql stores its data."
48 |     echo "     default: /var/lib/mysql"
49 |     echo "     You should have write permission on it."
50 |     echo " --import-data-dir=DIRECTORY The dump files for the tld-s are in this directory."
51 |     echo " --schema-file=SCHEMAFILE The schema file to be used when loading."
52 |     echo "     Defaults to the file whoiscrawler_mysql_schema.sql"
53 |     echo "     in the same directory where the script is."
54 |     echo " --tlds Comma-separated list of tlds to load."
55 |     echo " --db-version=STRING The version of the database to load. Required. Format: vNN, e.g. v19"
56 |     echo ""
57 |     echo "Consult the supplied README.txt for a detailed description."
58 |     echo ""
59 |     exit 1
60 | }
61 | 
62 | ARGS=$(\
63 |     getopt -o hv \
64 |     -l "help,verbose,debug,version,v,dry-run,mysql-user:,mysql-password:,\
65 | mysql-start-command:,mysql-stop-command:,mysql-data-dir:,import-data-dir:,schema-file:,tlds:,db-version:" \
66 |     -- "$@")
67 | 
68 | if [ $? -ne 0 ]; then
69 |     exit 6
70 | fi
71 | 
72 | eval set -- "$ARGS"
73 | 
74 | while true; do
75 |     case "$1" in
76 |         -h|--help)
77 |             shift
78 |             printHelpAndExit
79 |             ;;
80 | 
81 |         --verbose)
82 |             shift
83 |             VERBOSE="true"
84 |             ;;
85 | 
86 |         --debug)
87 |             shift
88 |             DEBUG="yes"
89 |             MYSQL_VERB_ARG="--verbose"
90 |             ;;
91 | 
92 |         --dry-run)
93 |             shift
94 |             DRY_RUN="Yes"
95 |             ;;
96 | 
97 |         -v|--version)
98 |             shift
99 |             printVersionAndExit
100 |             ;;
101 | 
102 |         --mysql-user)
103 |             shift
104 |             db_username=$1
105 |             shift
106 |             ;;
107 | 
108 |         --mysql-password)
109 |             shift
110 |             export MYSQL_PWD=$1
111 |             shift
112 |             ;;
113 | 
114 |         --mysql-start-command)
115 |             shift
116 |             export MYSQL_START_COMMAND=$1
117 |             shift
118 |             ;;
119 | 
120 |         --mysql-stop-command)
121 |             shift
122 |             export MYSQL_STOP_COMMAND=$1
123 |             shift
124 |             ;;
125 | 
126 |         --mysql-data-dir)
127 |             shift
128 |             MYSQL_DATA_DIR=$1
129 |             if ! [ -d "$MYSQL_DATA_DIR" ]; then
130 |                 printError "The specified mysql data directory does not exist."
131 |                 exit 1
132 |             fi
133 |             shift
134 |             ;;
135 | 
136 |         --tlds)
137 |             shift
138 |             TLDS=$1
139 |             shift
140 |             ;;
141 | 
142 |         --import-data-dir)
143 |             shift
144 |             IMPORT_DATA_DIR=$1
145 |             if ! [ -d "$IMPORT_DATA_DIR" ]; then
146 |                 printError "The specified data directory does not exist."
147 |                 exit 1
148 |             fi
149 |             shift
150 |             ;;
151 | 
152 |         --schema-file)
153 |             shift
154 |             export SCHEMA_FILE=$1
155 |             shift
156 |             ;;
157 | 
158 |         --db-version)
159 |             shift
160 |             #format check
161 |             if echo $1 | grep --quiet -E "^v[0-9]+$"; then
162 |                 DATABASE_VERSION=$1
163 |             else
164 |                 printError "Invalid db-version specification. It should be like v19 or v6."
165 |                 exit 1
166 |             fi
167 |             shift
168 |             ;;
169 | 
170 |         --)
171 |             shift
172 |             break
173 |             ;;
174 | 
175 |         *)
176 |             ;;
177 |     esac
178 | done
179 | #preliminary checks
180 | #Check if we can write mysql's data directory
181 | if ! [ -w "$MYSQL_DATA_DIR" ] && [ "$DRY_RUN" == "No" ];then
182 |     printError "You cannot write the mysql data directory. Perhaps you should run this script as root."
183 |     exit 1
184 | fi
185 | 
186 | #We need the database version
187 | if [ -z "$DATABASE_VERSION" ];then
188 |     printError "Please specify --db-version, e.g. --db-version=v20."
189 |     printError "See also the output of "
190 |     printError "$MYNAME --help"
191 |     exit 1
192 | fi
193 | 
194 | #Set up mysql login credentials if needed
195 | 
196 | if [ -n "$db_username" ]; then
197 |     MYSQL_ARGUMENTS="--user=$db_username"
198 | fi
199 | if [ -n "$MYSQL_PWD" ]; then
200 |     MYSQL_ARGUMENTS="$MYSQL_ARGUMENTS --password=$MYSQL_PWD"
201 | fi
202 | printDebug "Mysql arguments: $MYSQL_ARGUMENTS"
203 | 
204 | 
205 | if [ $DRY_RUN == "Yes" ];then
206 |     if [ $DEBUG == "yes" -o $VERBOSE == "true" ];then
207 |         mysql_here="echo MySQL would do mysql"
208 |     else
209 |         mysql_here="true"
210 |     fi
211 | else
212 |     mysql_here="mysql $MYSQL_ARGUMENTS $MYSQL_VERB_ARG"
213 | fi
214 | printDebug "Mysql: $mysql_here"
215 | 
216 | #Check for the schema file
217 | 
218 | if [ -z $SCHEMA_FILE ];then
219 |     SCHEMA_FILE="whoiscrawler_mysql_schema.sql"
220 | fi;
221 | 
222 | if ! [ -f $SCHEMA_FILE ];then
223 |     printError "The schema file $SCHEMA_FILE is not found."
224 |     exit 1
225 | else
226 |     printVerbose "Schema file: $SCHEMA_FILE"
227 | fi
228 | 
229 | #Parse the list of tlds
230 | TLDS=$(echo $TLDS | sed -e s/,/\ /g | sed -e "s/\\./_/g")
231 | printDebug "TLDS: $TLDS"
232 | 
233 | FILELIST="contact.ibd registry_data.ibd whois_record.ibd domain_names_whoisdatacollector.ibd"
234 | 
235 | for TLD in $TLDS;do
236 |     printVerbose "Checking data for tld: $TLD"
237 |     TLDDOT=$(echo $TLD | tr _ .)
238 |     if [ -f $IMPORT_DATA_DIR/$TLDDOT.7z ];then
239 |         printVerbose "Compressed data found for domain $TLD, uncompressing"
240 |         wd=$(pwd)
241 |         cd $IMPORT_DATA_DIR
242 |         p7zip -d $TLDDOT.7z
243 |         cd $wd
244 |     fi
245 |     #For gtlds and cctlds we have different naming conventions
246 |     if [ -d "$IMPORT_DATA_DIR"/whoiscrawler_"$DATABASE_VERSION"_"$TLD" ];then
247 |         TLDDIR="$IMPORT_DATA_DIR"/whoiscrawler_"$DATABASE_VERSION"_"$TLD"
248 |     else
249 |         TLDDIR="$IMPORT_DATA_DIR"/domains_whoiscrawler_"$DATABASE_VERSION"_"$TLD"
250 |     fi
251 |     printDebug "TLD subdirectory:" $TLDDIR
252 | 
253 |     printVerbose "Checking files in $TLDDIR"
254 |     for FILE in $FILELIST;do
255 |         printDebug "Checking $TLDDIR/$FILE"
256 |         if ! [ -f "$TLDDIR/$FILE" ];then
257 |             printError "File $TLDDIR/$FILE is missing."
258 |             exit 1
259 |         fi
260 |     done
261 |     printVerbose "Files for $TLD are found, OK."
262 | done
263 | 
264 | #At this point we have all the files and all the information to load the database,
265 | #so we can do the
266 | #REAL JOB:
267 | TABLES="contact domain_names_whoisdatacollector registry_data whois_record"
268 | G_START_TIME=$(date +%s)
269 | 
270 | for TLD in $TLDS;do
271 |     printVerbose "Loading data for tld: $TLD"
272 |     #For gtlds and cctlds we have different naming conventions
273 |     if [ -d "$IMPORT_DATA_DIR"/whoiscrawler_"$DATABASE_VERSION"_"$TLD" ];then
274 |         TLDDIR="$IMPORT_DATA_DIR"/whoiscrawler_"$DATABASE_VERSION"_"$TLD"
275 |         DB=whoiscrawler_"$DATABASE_VERSION"_"$TLD"
276 |     else
277 |         TLDDIR="$IMPORT_DATA_DIR"/domains_whoiscrawler_"$DATABASE_VERSION"_"$TLD"
278 |         DB=domains_whoiscrawler_"$DATABASE_VERSION"_"$TLD"
279 |     fi
280 |     printVerbose "Creating database $DB."
281 |     $mysql_here -e "CREATE DATABASE $DB"
282 |     printVerbose "Loading schema."
283 |     $mysql_here "$DB" < "$SCHEMA_FILE"
284 | 
285 |     printVerbose "importing tablespaces"
286 |     #Each table's empty tablespace is discarded, so that the dumped .ibd file can replace it.
287 |     for table in $TABLES; do
288 |         START_TIME=$(date +%s)
289 |         q="set FOREIGN_KEY_CHECKS=0;ALTER TABLE $DB.$table DISCARD TABLESPACE;"
290 |         printDebug "$q"
291 |         $mysql_here -e "$q"
292 |         file="$table.ibd"
293 |         printVerbose "Copying table file $file from $TLDDIR to $MYSQL_DATA_DIR/$DB"
294 |         if [ $DRY_RUN == "No" ];then
295 |             printVerbose "Stopping MySQL server before copying."
296 |             $MYSQL_STOP_COMMAND
297 |             cp "$TLDDIR/$file" "$MYSQL_DATA_DIR/$DB/."
298 |             chown -R mysql:mysql "$MYSQL_DATA_DIR/$DB"
299 |             printVerbose "Starting MySQL again."
300 |             $MYSQL_START_COMMAND
301 |         fi
302 |         printVerbose "Importing tablespace"
303 |         q="ALTER TABLE $DB.$table IMPORT TABLESPACE"
304 |         printDebug "$q"
305 |         $mysql_here -e "$q"
306 | 
307 |         END_TIME=$(date +%s)
308 |         DUR=$((END_TIME-START_TIME))
309 |         printVerbose "Import of the table $table for the tld $TLD took $DUR seconds"
310 |     done
311 | done
312 | 
313 | G_END_TIME=$(date +%s)
314 | GDUR=$((G_END_TIME-G_START_TIME))
315 | printVerbose "$MYNAME has finished in $GDUR seconds."
316 | 
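#An illustrative invocation (the credentials, directory and tlds below
#are examples only; the script needs write access to the mysql data
#directory, so it is typically run as root):
#
#  ./load_whois_percona.sh --verbose \
#      --mysql-user=root --mysql-password=secret \
#      --import-data-dir=/data/whois_dumps \
#      --db-version=v20 \
#      --tlds=aero,tel
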
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/SPECIFICATIONS.txt:
--------------------------------------------------------------------------------
1 | SPECIFICATIONS.txt for
2 | 
3 | download_whois_data.py
4 | 
5 | Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
6 | -------------------------------------------------------------------
7 | 
8 | This document is intended for developers and advanced users.
9 | 
10 | It provides a declarative specification. The requirements which the
11 | script is supposed to meet by design are described. The business logic
12 | of the download process is outlined. The return codes of the script
13 | are listed and their possible causes are defined.
14 | 
15 | For a brief guide on how to use the program consult "README.txt".
16 | 
17 | Contents:
18 | ---------
19 | 
20 | 1. Requirements met by the downloader script
21 | 
22 | 2. The download process
23 | 
24 | 3. Exit codes
25 | 
26 | 1. Requirements met by the downloader script
27 | --------------------------------------------
28 | 
29 | The script is designed to be a simple utility to support http-based
30 | downloading of data from the feeds provided by WhoisXML API, Inc.
31 | 
32 | It is designed to meet the following requirements:
33 | 
34 | -The script is cross-platform. It is supposed to run on any platform
35 | on which Python >= 3.6.x (or the legacy Python >= 2.7.x) and the
36 | dependencies of the script are available.
37 | 
38 | -All its functions are available from the command-line, using command-line
39 | arguments.
40 | 
41 | -It is possible to operate the script with a series of GUI dialogs
42 | instead of command-line parameters.
43 | 
44 | -It is subscription-independent: all possible data feeds and formats
45 | are offered, regardless of the type of the subscription used for the
46 | authentication. The script does not verify permissions; it reports an
47 | error if access to the given resource is denied.
48 | 
49 | -It supports plain http access with simple http authentication as well
50 | as https access with ssl key-based authentication.
51 | 
52 | -The available data are specified in a config file (feeds.ini)
53 | provided with the scripts. The feeds.ini file is part of the
54 | distribution; the end-users are not supposed to modify it.
55 | 
56 | -The script determines the list of files to be downloaded offline, based
57 | on the feeds' configuration and the parameters provided (in
58 | command-line arguments or in the dialog utility), before the actual
59 | download process.
60 | 
61 | -The files in the target directory are arranged in the same directory
62 | structure as on the server.
63 | 
64 | -The script reports, at the end of its operation, the list of files
65 | which could not be downloaded. This is not necessarily an error,
66 | as the predetermined list may contain files which
67 | only exist under certain circumstances.
68 | 
69 | -The script does not verify the dates and quarterly database versions
70 | specified. A wrong specification results in error messages or in
71 | reports on files which could not be downloaded.
72 | 
73 | -It downloads md5 checksums before each file whenever available. Files
74 | which already exist in the target directory are downloaded anew if
75 | and only if their checksum differs from the checksum on the
76 | server. In this way the script can be used to verify or synchronize a
77 | local file set.
78 | 
79 | -The downloading of already existing (partial) files is resumed by
80 | default. This can be overridden by the --no-resume option: in this
81 | case, existing files not matching their md5 sums are downloaded again
82 | from scratch.
83 | 
84 | 
85 | 2. The download process
86 | -----------------------
87 | 
88 | The script follows a streamlined procedure for downloading all the needed
89 | data. It is outlined in this section.
90 | 
91 | Phase I: Preparation
92 | 
93 | The target directory is not modified in this phase.
94 | 
95 | The parameters provided by the user are read, either by parsing the
96 | command-line arguments or through a sequence of dialog windows,
97 | depending on the mode of operation. During and after this process
98 | there are some consistency checks. Upon the failure of any of these
99 | checks the script terminates with an error: it exits with an error
100 | message and an error code; see Section 3 for the list of error codes.
101 | 
102 | In command-line mode the following procedure occurs after parsing and
103 | checking the arguments. (In interactive mode these steps are part of
104 | the verification process during the interaction with the user.)
105 | 
106 | Based on this information the feed downloader components of the
107 | script are initialized. (These are objects belonging to the
108 | "WhoisDataFeed" class.)
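For orientation, an entry of feeds.ini has roughly the following shape
(purely illustrative: only the per-feed attributes mentioned in this
document are shown, and the values, which ship with the distribution,
are elided):

    [domain_names_new]
    access_test_file = ...
    supported_tlds_url = ...
    alt_tlds_url = ...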
109 | 
110 | The supplied login credentials are verified by downloading the access
111 | test file of each of the feeds (specified as the "access_test_file"
112 | attribute in feeds.ini). Failure of any of these checks results in
113 | termination with an error.
114 | 
115 | The list of available tlds for each feed is determined by downloading
116 | the files specifying the actual list of supported tlds (specified in
117 | the "supported_tlds_url" attribute in feeds.ini). In the case of daily
118 | feeds, it is also possible to determine the list based on the list of
119 | tlds that contain changes on the given day. The --only-changed option
120 | results in this behavior. In this case, the list in the "alt_tlds_url"
121 | attribute of the feeds.ini file will be used. When trying to download
122 | with the --only-changed option from a feed that has no "alt_tlds_url"
123 | specified, an error will occur. If the list of supported tlds is date
124 | dependent, the supported tlds list is considered to be the union of
125 | all of these. The tlds for which the download will be carried out will
126 | be the intersection of this set with the set specified by the user.
127 | 
128 | By the end of this phase the list of files to be downloaded is
129 | determined.
130 | 
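Schematically (a toy illustration with made-up tld sets):

    supported_tlds = {'com', 'net', 'org', 'info'}  # union over the dates involved
    requested_tlds = {'com', 'org', 'aero'}         # specified by the user
    tlds_to_download = supported_tlds & requested_tlds  # {'com', 'org'}
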
131 | Phase II: downloading the files
132 | 
133 | The script loops through the list of files to be downloaded.
134 | 
135 | For each file, the following logic is followed:
136 | 
137 | 0. In the case of daily feeds which have a mechanism indicating whether
138 | the data of the feed for the given day in the given format are complete,
139 | the script checks the completeness of the data. If the data are
140 | incomplete, a warning will be given. If the --no-premature option is
141 | used, the download of the data in the given format from the given feed
142 | for the given day will be skipped.
143 | 
144 | 1. The md5 checksum of the file is downloaded from the server. (Most
145 | feeds support it; this step is skipped if the feed does not support
146 | md5 checksums.)
147 | 
148 | 2. The consistency of the local file with this md5 checksum is
149 | verified. If the file is there and consistent with its md5 checksum,
150 | the downloading of the file is considered complete and the process
151 | is finished for this file.
152 | 
153 | 3. If the file does not exist, there is no md5 file, or the md5 check
154 | fails, it is verified that the number of download attempts for this
155 | file does not exceed 3 (or the number specified in the --maxtries
156 | option). If it is exceeded, the file is considered unavailable and the
157 | process is finished for the file.
158 | 
159 | 4. The downloading of the file is initiated. After the download attempt
160 | (regardless of its success), the process is repeated from step 1 for
161 | the file. By default the downloading of an existing, possibly broken
162 | partial download is resumed.
163 | 
164 | Phase III: report and exit.
165 | 
166 | In verbose mode a list of the files which were unavailable is reported.
167 | The script terminates with exit code 0 if all files were checked
168 | and found O.K. or have been downloaded as necessary. A return code 2
169 | is generated if there were unavailable files.
170 | 
171 | 3. Exit codes
172 | -------------
173 | 
174 | 0: Normal termination.
175 | 
176 | 1: Abnormal termination.
177 |    This is the error code given upon most errors:
178 |    - No feed is specified for downloading.
179 |    - The feed specified for downloading does not exist (invalid feed).
180 |    - No data format is specified for downloading.
181 |    - The chosen feed does not support the chosen data format.
182 |    - The database version of a quarterly feed is not specified or
183 |      it is not of the expected form
184 |      (character "v" followed by a number, e.g. v20).
185 |    - The start date for downloading from a daily feed is not specified
186 |      or it is not in the format "YYYYMMDD".
187 |    - The end date for downloading from a daily feed is given but
188 |      not in the format "YYYYMMDD".
189 |    - The end date of the interval for daily feed downloading is earlier than
190 |      the start date.
191 |    - Login failed due to a bad login name or password, or bad SSL credentials.
192 |    - The specified quarterly database does not exist
193 |      (results in a "Login failed" message).
194 |    - The list of tlds to be downloaded is not specified.
195 |    - None of the tlds specified for downloading are supported by the feed.
196 |    - The output directory is not specified or does not exist.
197 |    - Invalid feed configuration file: one or more feeds are ill-defined.
198 |    - Password auth chosen, no password given, and ~/.whoisxmlapi_login.ini does
199 |      not exist.
200 |    - The SSL credentials for authentication are invalid.
201 |    - The download session is not open when downloading
202 |      (internal issue, should not occur).
203 |    - Database version specified for a daily feed
204 |      (internal issue, should not occur).
205 |    - Time interval specified for a quarterly feed
206 |      (internal issue, should not occur).
207 |    - The supported_tlds file for the feed cannot be downloaded.
208 | 
209 | 2: Normal termination, but some files which can or should be there according
210 |    to the specification were not there.
211 |    (Possible causes: no file on the server because it is not yet generated,
212 |    no file on the server because there was no change in the tld on
213 |    the given day, etc.)
214 | 
215 | 3: Premature daily data. Some daily data in some format for some days
216 |    were not yet finalized on the server.
217 | 
218 | 6: Informational or canceled action.
219 |    Possible causes are:
220 |    - In interactive mode, "Cancel" was pressed in a dialog window.
221 |    - The program was invoked with the --list-feeds option.
222 |    - The program was invoked with the --list-dataformats option.
223 |    - The program was invoked with the --list-tlds option.
224 | 
225 | 
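A calling script can branch on these codes. A minimal sketch in the
shell (the actual download arguments are elided):

    ./download_whois_data.py ...
    case $? in
        0) echo "All files verified or downloaded." ;;
        2) echo "Finished, but some expected files were unavailable." ;;
        3) echo "Some daily data were not yet finalized on the server." ;;
        *) echo "The download failed or was canceled." ;;
    esac
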
--------------------------------------------------------------------------------
/whoisxmlapi_download_whois_data/whois_utils/whois_web_download_utils.py:
--------------------------------------------------------------------------------
1 | # Web download module of Whois API LLC end user scripts
2 | #
3 | # Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
4 | #
5 | 
6 | from __future__ import print_function
7 | 
8 | try:
9 |     from urllib.parse import urlparse
10 | except ImportError:
11 |     from urlparse import urlparse
12 | try:
13 |     from HTMLParser import HTMLParser
14 | except ImportError:
15 |     from html.parser import HTMLParser
16 | 
17 | import requests
18 | import os, sys, hashlib, re
19 | import datetime
20 | import time
21 | 
22 | import whois_utils.whois_user_interaction as whois_user_interaction
23 | from whois_utils.whois_user_interaction import *
24 | whois_user_interaction.VERBOSE = True
25 | whois_user_interaction.DEBUG = True
26 | 
27 | class Indexparser(HTMLParser):
28 |     """This parser parses an autoindexed directory listing and collects the
29 |     file names: the href attributes of links not containing a slash."""
30 |     FileList = []
31 |     def reset_filelist(self):
32 |         self.FileList = []
33 |     def handle_starttag(self, tag, attrs):
34 |         if tag == 'a':
35 |             for attr in attrs:
36 |                 if not re.search(r'/', attr[1]):
37 |                     self.FileList.append(attr[1])
38 | 
39 | Index_Parser = Indexparser()
40 | 
41 | def md5_check( path_filename, md5_file_path ):
42 |     """ Determines if the md5 checksum checks out.
43 | 
44 |     Return:
45 |     Returns True if the checksum is correct,
46 |     False if it is wrong or either the file or the checksum does not exist.
47 |     """
48 |     try:
49 |         calc_check_sum = calc_md5( path_filename )
50 |         with open( md5_file_path ) as md5_file:
51 |             correct_check_sum = md5_file.readline().split()[0].strip()
52 |         if( calc_check_sum == correct_check_sum ):
53 |             print_verbose("MD5 check passed for %s"%path_filename)
54 |             return True
55 |         print_verbose("MD5 check failed for %s"%path_filename)
56 |         return False
57 |     except Exception as e:
58 |         print_verbose("Exception in MD5 check for %s:\n%s"%(path_filename,str(e)))
59 |         return False
60 | 
61 | def calc_md5( path_filename ):
62 |     """ Calculates the md5 of a file.
63 | 
64 |     Return:
65 |     Returns the hex digits in string form representing the md5 of the file.
66 |     """
67 |     hash_md5 = hashlib.md5()
68 |     with open( path_filename , "rb") as f:
69 |         for chunk in iter(lambda: f.read(4096), b""):
70 |             hash_md5.update(chunk)
71 |     return hash_md5.hexdigest()
72 | 
73 | 
74 | def web_download_and_check_file(url, md5url, session, output_dir, maxtries, no_resume):
75 |     """Given a session, downloads the file and its md5. If it fails according to the md5, retries maxtries times."""
76 |     filename = os.path.basename(urlparse(url).path)
77 |     filename = os.path.abspath(os.path.join(output_dir, filename))
78 |     if md5url is not None:
79 |         md5filename = os.path.basename(urlparse(md5url).path)
80 |         md5filename = os.path.abspath(os.path.join(output_dir, md5filename))
81 |     else:
82 |         md5filename = None
83 | 
84 |     gotit = False
85 |     force = False
86 |     giveup = False
87 |     resume = not no_resume
88 |     ntries = 0
89 |     while not gotit and not giveup and ntries < maxtries:
90 |         print_verbose('Verified download of %s: attempt #%d' % (url, ntries+1))
91 |         gotfile = web_download_file(url, session, output_dir, maxtries, force, resume=resume)
92 |         if md5url is not None:
93 |             gotmd5 = web_download_file(md5url, session, output_dir, maxtries, True, resume=False)
94 |         else:
95 |             gotmd5 = False
96 |         if gotfile and gotmd5:
97 |             gotit = md5_check(filename, md5filename)
98 |             if not gotit:
99 |                 print_verbose('Verified download: attempt #%d failed, md5 does not match. Redownloading.' % (ntries+1))
100 |         elif gotfile and not gotmd5:
101 |             print_verbose('File downloaded but no md5 sum. Unverified. This can be normal.')
102 |             gotit = True
103 |         else:
104 |             print_verbose('File not found, it may not exist on the server.')
105 |             gotit = False
106 |             giveup = True
107 |         ntries += 1
108 |         #from the second attempt on, we redownload anyway
109 |         force = True
110 |     return gotit
111 | 
112 | 
113 | def web_download_file(url, session, output_dir, maxtries, force, resume=True):
114 |     """Given a session, downloads the file into the directory. Creates the directory if it does not exist.
115 |     If force is set, the file is downloaded even if it is already there."""
116 | 
117 |     filename = os.path.basename(urlparse(url).path)
118 |     filename = os.path.abspath(os.path.join(output_dir, filename))
119 |     print_debug('File to download: %s' % (filename))
120 |     # Make the dir to output files to if it does not exist yet
121 |     if not os.path.exists(output_dir):
122 |         os.makedirs(output_dir)
123 | 
124 |     url_print = os.path.basename(url)
125 |     # Redownload the file if a problem occurs with the network
126 |     ntries = 0
127 |     if os.path.isfile(filename) and (not force) and (not resume):
128 |         print_verbose('File %s exists.' % (filename))
129 |         return(True)
130 |     else:
131 |         while ntries < maxtries:
132 |             print_debug('Try #%d' % (ntries + 1))
133 |             resume_header = None
134 |             already_have = 0
135 |             if resume:
136 |                 file_open_mode='ab'
137 |                 try:
138 |                     already_have = os.path.getsize(filename)
139 |                     print_verbose("Already have: %d bytes of the file, trying to resume." % already_have)
140 |                 except:
141 |                     print_verbose("No partial file to resume.")
142 |                 resume_header = {'Range': 'bytes=%d-' % already_have}
143 |             else:
144 |                 file_open_mode='wb'
145 |             try:
146 |                 r = session.get(url, stream=True, timeout=30, headers=resume_header)
147 |                 print_debug('Status code: %s' % r.status_code)
148 |                 if r.status_code in set([200, 206]):
149 |                     with open(filename, file_open_mode) as out:
150 |                         if( 'content-length' in (r.headers) ):
151 |                             dl_total_length = int(r.headers.get('content-length')) + already_have
152 |                             print_debug('Total length: %s' % (str(dl_total_length)))
153 |                         dl_size=already_have
154 |                         dl_start_chunk = datetime.datetime.now()
155 | 
156 |                         sys.stdout.write("\r ")
157 |                         sys.stdout.flush()
158 |                         for chunk in r.iter_content(chunk_size=(1024*1024)):
159 |                             out.write(chunk)
160 |                             dl_end_chunk = datetime.datetime.now()
161 |                             dl_size += len(chunk)
162 |                             if 'content-length' in (r.headers):
163 |                                 #Report the progress as a fraction of the total length.
164 |                                 dl_done = float(dl_size) / dl_total_length
165 |                                 dl_dtdelta = ( dl_end_chunk - dl_start_chunk ).microseconds
166 |                                 sys.stdout.write("\r{0} Progress: {1:.2%}".format(url_print, dl_done))
167 |                                 sys.stdout.flush()
168 |                                 dl_start_chunk = datetime.datetime.now()
169 | 
170 |                         # Clear the progress line.
171 |                         sys.stdout.write("\r{0} [OK] ".format(url_print))
172 |                         sys.stdout.flush()
173 |                 elif r.status_code == 416:
174 |                     print("File of correct size already there")
175 |                 elif r.status_code == 401:
176 |                     print("HTTP %s Unauthorized. Login credentials are wrong." % r.status_code)
177 |                     return False
178 |                 elif r.status_code == 404:
179 |                     print("HTTP 404: %s does not exist." % (url_print))
180 |                     ntries = maxtries + 1
181 |                     return False
182 |                 else:
183 |                     sys.stdout.write("\r%s [Failed] Status code: %s \n" % (str(url_print), str(r.status_code)))
184 |                     sys.stdout.flush()
185 |                     return False
186 |             except (requests.exceptions.Timeout,
187 |                     requests.exceptions.ConnectionError):
188 |                 sys.stdout.write("\rNetwork timed out. Attempting redownload or resume.")
189 |                 sys.stdout.flush()
190 |                 time.sleep(4)
191 |                 ntries += 1
192 |                 continue
193 |             except requests.exceptions.ChunkedEncodingError:
194 |                 sys.stdout.write("\rChunked Encoding Error. Redownloading or resuming.")
195 |                 sys.stdout.flush()
196 |                 time.sleep(4)
197 |                 ntries += 1
198 |                 continue
199 |             sys.stdout.write('\n')
200 |             sys.stdout.flush()
201 |             return(True)
202 | 
203 | def webdir_ls(url, session):
204 |     """Given the session and the URL, return the list of all files in the directory
205 |     as a list of filenames to be appended to the URL.
206 |     The URL MUST point to an autoindexed directory (not verified by the function).
207 |     An empty list is returned if something goes wrong.
208 |     """
209 |     rawdirlist = session.get(url, stream=True, timeout=30)
210 |     print_debug("Getting url: %s" % (url))
211 |     print_debug("Result: " + str(rawdirlist))
212 |     if rawdirlist.status_code == 200:
213 |         Index_Parser.reset_filelist()
214 |         Index_Parser.feed(rawdirlist.text)
215 |         return(Index_Parser.FileList)
216 |     else:
217 |         return([])
--------------------------------------------------------------------------------
/whoisxmlapi_mysqldump_loaders/load_mysql_data_per_tables.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #Sample script to load ASCII mysql dumps for a tld.
4 | #This loads the schema first, then loads each table's data separately.
5 | #Recommended for a large database such as .com
6 | #Copyright (c) 2010-2021 Whois API LLC, http://www.whoisxmlapi.com
7 | #
8 | #Note: IF YOU ARE READING THIS SCRIPT JUST TO COLLECT IDEAS FOR YOUR OWN LOADER,
9 | # VISIT THE END OF THE FILE WHERE THE REAL WORK IS DONE
10 | #
11 | # Global variables.
12 | #
13 | LANG=C
14 | LC_ALL=C
15 | VERSION="0.0.3"
16 | VERBOSE="no"
17 | DEBUG="no"
18 | SHOWPROGRESS="no"
19 | MYNAME=$(basename $0)
20 | CATCOMMAND="cat"
21 | 
22 | #No mysql stuff by default. This is set by mandatory args.
23 | unset MYSQL_USER
24 | unset MYSQL_PASSWORD
25 | unset MYSQL_DATABASE
26 | 
27 | #Importing generic utilities
28 | 
29 | source load_mysql_utils.sh
30 | 
31 | function printHelpAndExit()
32 | {
33 |     echo "Usage: $MYNAME [OPTION]..."
34 |     echo "$MYNAME -- loads data for a given tld"
35 |     echo "from a schema file and separate table files"
36 |     echo " into a mysql database."
37 |     echo ""
38 |     echo " -h, --help Print this help and exit."
39 |     echo " -v, --version Print version information and exit."
40 |     echo " --verbose Print more messages."
41 |     echo " --show-progress Display progress bars when loading data from dumps."
42 |     echo "     Recommended, especially for large domains."
43 |     echo " --mysql-user=USERNAME User name to login to the mysql database (optional)."
44 |     echo " --mysql-password=PASSWORD Password to login to the data source (optional)."
45 |     echo " --mysql-database=DATABASE The name of the mysql database to load data into."
46 |     echo "     This database is created by the script by default,"
47 |     echo "     so it should not exist yet; otherwise use --no-create-db."
48 |     echo " --schema-file=SCHEMAFILE The schema file to be used when loading."
49 |     echo "     IMPORTANT: should be schema only, should not contain data."
50 |     echo "     Not to be used with --tld."
51 |     echo " --schema-only If specified, the table files are not loaded."
52 |     echo " --data-only If specified, only table data are loaded into an existing database"
53 |     echo "     with an already loaded schema."
54 |     echo " --no-create-db Does not create the database newly, supposes that it already exists."
55 |     echo " --table-files-directory=TABLEFILESDIR The directory where the dumps of all tables reside."
56 |     echo "     This contains the files with the actual data."
57 |     echo "     Not to be used with --tld."
58 |     echo " --tld=TLD Load data for this tld."
59 |     echo " --schema-files-dir=DIRECTORY The schema files for the tld-s are in this directory. The table files have to be in its subdirectory named 'tables'. Only for --tld."
60 |     echo " --db-version=STRING The version of the database to load. Required for --tld. Format: vNN, e.g. v19"
v19" 61 | echo "" 62 | echo "Examples:" 63 | echo "" 64 | echo " -loading sample data downloaded into a directory mysqldump_sample from " 65 | echo " http://domainwhoisdatabase.com/whois_database/sample/gtlds/v20/mysqldump_sample/aaa" 66 | echo "" 67 | echo "$MYNAME --mysql-database=sample_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --schema-files-dir=mysqldump_sample --db-version=v20 --tld=aaa --verbose --show-progress" 68 | echo "" 69 | echo " or the same task quietly, specifying the file names and paths directly:" 70 | echo "" 71 | echo "$MYNAME --schema-file=mysqldump_sample/aaa/whoiscrawler_v20_aaa_mysql_schema.sql.gz --table-files-directory=mysqldump_sample/aaa/tables --mysql-database=sample_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword" 72 | echo "" 73 | echo " -loading production data downloaded into a directory database_dump/mysqldump/aaa from" 74 | echo " http://www.domainwhoisdatabase.com/whois_database/v20/database_dump/mysqldump/aaa" 75 | echo "" 76 | echo "$MYNAME --mysql-database=production_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --schema-files-dir=database_dump/mysqldump --tld=aaa --db-version=v20 --verbose --show-progress" 77 | echo "" 78 | echo " or the same task specifying the file names and paths directly:" 79 | echo "" 80 | echo "$MYNAME --schema-file=database_dump/mysqldump/aaa/whoiscrawler_v20_aaa_mysql_schema.sql.gz --table-files-directory=database_dump/mysqldump/aaa/tables --mysql-database=prod_db_aaa --mysql-user=whoisuser --mysql-password=whoispassword --verbose --show-progress" 81 | exit 1 82 | } 83 | 84 | ARGS=$(\ 85 | getopt -o hv \ 86 | -l "help,verbose,debug,show-progress,version,v,mysql-database:,mysql-user:,mysql-password:,table-files-directory:,\ 87 | schema-file:,schema-only,data-only,no-create-db,tld:,schema-files-dir:,db-version:" \ 88 | -- "$@") 89 | 90 | 91 | if [ $? -ne 0 ]; then 92 | exit 6 93 | fi 94 | 95 | eval set -- "$ARGS" 96 | 97 | while true; do 98 | case "$1" in 99 | -h|--help) 100 | shift 101 | printHelpAndExit 102 | ;; 103 | 104 | --verbose) 105 | shift 106 | VERBOSE="true" 107 | ;; 108 | 109 | --debug) 110 | shift 111 | DEBUG="yes" 112 | VERBOSEARG="--verbose" 113 | ;; 114 | 115 | --show-progress) 116 | shift 117 | if which pv > /dev/null;then 118 | CATCOMMAND="pv" 119 | else 120 | printError "The show-progress argument needs pv to be installed (e.g. apt-get install pv)" 121 | exit 1 122 | fi 123 | ;; 124 | 125 | -v|--version) 126 | shift 127 | printVersionAndExit 128 | ;; 129 | 130 | --mysql-user) 131 | shift 132 | db_username=$1 133 | shift 134 | ;; 135 | 136 | --mysql-password) 137 | shift 138 | export MYSQL_PWD=$1 139 | shift 140 | ;; 141 | 142 | --mysql-database) 143 | shift 144 | db=$1 145 | shift 146 | ;; 147 | 148 | --schema-only) 149 | shift 150 | SCHEMAONLY="True" 151 | ;; 152 | 153 | --data-only) 154 | shift 155 | DONTCREATEDB="True" 156 | DATAONLY="True" 157 | ;; 158 | 159 | --no-create-db) 160 | shift 161 | DONTCREATEDB="True" 162 | ;; 163 | 164 | --table-files-directory) 165 | shift 166 | table_files_dir=$1 167 | if ! [ -d "$table_files_dir" ]; then 168 | printError "The directory $schema_file in which the table files should reside is not found." 169 | exit 1 170 | fi 171 | shift 172 | ;; 173 | 174 | --schema-file) 175 | shift 176 | schema_file=$(readlink -e "$1") 177 | if ! [ -f "$schema_file" ]; then 178 | printError "The specified schema file $schema_file is not found." 179 | exit 1 180 | fi 181 | #IF we have zgrep, we verify if the schema file is really schema-only. 
182 |             if [ -x "$(which zgrep)" ]; then
183 |                 if zgrep -q "Dumping data for table" $schema_file; then
184 |                     printError "The specified schema file $schema_file contains actual data."
185 |                     printError "Please specify a schema-only file."
186 |                     exit 1
187 |                 fi
188 |             fi
189 |             shift
190 |             ;;
191 | 
192 |         --tld)
193 |             shift
194 |             TLD=$1
195 |             shift
196 |             ;;
197 | 
198 |         --schema-files-dir)
199 |             shift
200 |             SCHEMA_FILES_DIR=$1
201 |             if ! [ -d "$SCHEMA_FILES_DIR" ]; then
202 |                 printError "The specified dump file directory does not exist."
203 |                 exit 1
204 |             fi
205 |             shift
206 |             ;;
207 | 
208 |         --db-version)
209 |             shift
210 |             #format check
211 |             if echo $1 | grep --quiet -E "^v[0-9]+$"; then
212 |                 DATABASE_VERSION=$1
213 |             else
214 |                 printError "Invalid db-version specification. It should be like v19 or v6."
215 |                 exit 1
216 |             fi
217 |             shift
218 |             ;;
219 | 
220 |         --)
221 |             shift
222 |             break
223 |             ;;
224 | 
225 |         *)
226 |             ;;
227 |     esac
228 | done
229 | 
230 | #some verification before doing the real job
231 | 
232 | #Set up mysql login credentials if needed
233 | if [ -n "$db_username" ]; then
234 |     MYSQL_ARGUMENTS="--user=$db_username"
235 | fi;
236 | 
237 | printDebug "Mysql arguments: $MYSQL_ARGUMENTS"
238 | printDebug "Mysql Password: $MYSQL_PWD"
239 | 
240 | if [ -n "$table_files_dir" -o -n "$schema_file" ] && [ -n "$TLD" -o -n "$SCHEMA_FILES_DIR" -o -n "$DATABASE_VERSION" ]; then
241 |     printError "Conflicting arguments. Please use either --table-files-directory + --schema-file or --tld + --schema-files-dir + --db-version."
242 |     exit 1
243 | fi
244 | 
245 | if [ -z "$db" ]; then
246 |     printError "Mysql database not specified. See $MYNAME --help"
247 |     exit 1
248 | fi
249 | 
250 | #If the tld is specified, we find out the schema file name and the tables dir.
251 | if [ -z "$schema_file" ]; then
252 |     schema_file="$SCHEMA_FILES_DIR"/"$TLD"/whoiscrawler_"$TLD"_mysql_schema.sql.gz
253 |     if [ ! -f "$schema_file" ]; then
254 |         schema_file="$SCHEMA_FILES_DIR"/"$TLD"/whoiscrawler_"$DATABASE_VERSION"_"$TLD"_mysql_schema.sql.gz
255 |     fi
256 |     #Quarterly feeds case
257 |     if [ ! -f "$schema_file" ]; then
258 |         TLDUNDERSCORE=$(echo "$TLD" | sed -e "s/\./_/g")
259 |         schema_file="$SCHEMA_FILES_DIR"/"$TLD"/domains_whoiscrawler_"$DATABASE_VERSION"_"$TLDUNDERSCORE"_mysql_schema.sql.gz
260 |     fi
261 |     if [ -z "$SCHEMAONLY" ]; then
262 |         table_files_dir="$SCHEMA_FILES_DIR"/"$TLD"/tables
263 |     fi
264 | fi
265 | 
266 | printVerbose "Schema file: $schema_file"
267 | printVerbose "Tables dir: $table_files_dir"
268 | 
269 | if [ ! -f "$schema_file" ]; then
270 |     printError "Schema file not specified or does not exist. See $MYNAME --help"
271 |     exit 1
272 | fi
273 | if [ -z $SCHEMAONLY ] && [ ! -d "$table_files_dir" ]; then
274 |     printError "The directory in which the table files should reside is not specified or does not exist."
275 |     printError "See $MYNAME --help"
276 |     exit 1
277 | fi
278 | 
279 | #THE REAL WORK STARTS HERE
280 | if [ -z "$DONTCREATEDB" ]; then
281 |     printVerbose "Creating database $db"
282 |     mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} -e "create database $db"
283 |     printVerbose "Loading mysql schema"
284 | else
285 |     printVerbose "Not creating database, --no-create-db was specified."
286 | fi
287 | 
288 | if [ -z "$DATAONLY" ]; then
289 |     if [ ${schema_file: -3} == ".gz" ]; then
290 | 
291 |         $CATCOMMAND $schema_file | gunzip -c | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} $db
292 |     else
293 |         $CATCOMMAND $schema_file | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} $db
294 |     fi
295 |     printVerbose "Mysql schema loaded."
296 | fi
297 | 
298 | if [ -n "$SCHEMAONLY" ]; then
299 |     printVerbose "Schema-only loading, so we are ready."
300 |     exit 0
301 | fi
302 | 
303 | tables="whois_record registry_data contact domain_names_whoisdatacollector"
304 | 
305 | printVerbose "Trying to drop some unnecessary indices to load faster."
306 | printVerbose " They may not exist, so mysql errors are normal here."
307 | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} $db -e "alter table whois_record drop index domain_name_index;alter table whois_record drop index domain_name;" >/dev/null 2>&1
308 | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} $db -e "alter table registry_data drop index domain_name_index;alter table registry_data drop index domain_name;">/dev/null 2>&1
309 | printVerbose "Unnecessary indices, if any, have been dropped."
310 | table_files_dir=$table_files_dir/*.sql.gz
311 | 
312 | printVerbose "Loading data from table files"
313 | for file in $table_files_dir; do
314 | 
315 |     #Time the loading of each table file.
316 |     if [ -f "$file" ]; then
317 |         time=`date +%s`
318 | 
319 |         printVerbose "loading data from file $file"
320 |         #No verbose mysql here as the echoed output can be huge.
321 |         if [ ${file: -3} == ".gz" ]; then
322 |             { echo "SET autocommit = 0;"
323 |               $CATCOMMAND "$file" | gunzip -c
324 |               echo "commit;" ; } | mysql ${MYSQL_ARGUMENTS} --force $db
325 |         elif [ ${file: -4} == ".sql" ]; then
326 |             { echo "SET autocommit = 0;"
327 |               $CATCOMMAND "$file"
328 |               echo "commit;" ; } | mysql ${MYSQL_ARGUMENTS} --force $db
329 |         fi
330 | 
331 |     fi
332 | 
333 |     time2=`date +%s`
334 |     dur=`expr $time2 - $time`
335 |     printVerbose " loading data from file $file took $dur seconds"
336 | 
337 | done
338 | printVerbose "Creating new indices"
339 | time=`date +%s`
340 | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} $db -e "alter table whois_record add index domain_name_index(domain_name)"
341 | mysql ${MYSQL_ARGUMENTS} ${VERBOSEARG} $db -e "alter table registry_data add index domain_name_index(domain_name)"
342 | time2=`date +%s`
343 | dur=`expr $time2 - $time`
344 | printVerbose " adding indices took $dur seconds."
345 | 
--------------------------------------------------------------------------------