├── .Rbuildignore ├── NEWS.md ├── tests ├── testthat.R └── testthat │ └── test-tldextract.R ├── data └── tldnames.rda ├── NAMESPACE ├── tldextract.Rproj ├── man ├── tldnames.Rd ├── tldextract.Rd └── getTLD.Rd ├── DESCRIPTION ├── .gitignore ├── README.Rmd ├── README.md └── R └── tldextract.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # tldextract 1.0 2 | 3 | * Initial release -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_check("tldextract") -------------------------------------------------------------------------------- /data/tldnames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jayjacobs/tldextract/HEAD/data/tldnames.rda -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.1.1): do not edit by hand 2 | 3 | export(getTLD) 4 | export(tldextract) 5 | import(data.table) 6 | importFrom(httr,GET) 7 | importFrom(httr,content) 8 | importFrom(httr,stop_for_status) 9 | importFrom(httr,user_agent) 10 | -------------------------------------------------------------------------------- /tldextract.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: 
pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace,vignette 19 | -------------------------------------------------------------------------------- /man/tldnames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.1): do not edit by hand 2 | \docType{data} 3 | \name{tldnames} 4 | \alias{tldnames} 5 | \title{List of Top-Level Domain Names} 6 | \format{A vector containing the top level domains} 7 | \description{ 8 | A dataset containing a single vector of the top-level domain names as retrieved from 9 | \url{https://publicsuffix.org/list/effective_tld_names.dat} on August 2, 2014. 10 | } 11 | \details{ 12 | This can be manually refreshed upon library load with the \code{\link{getTLD}} function. 13 | 14 | Note, the non-ASCII characters may create a problem, and needs more testing. 15 | } 16 | \keyword{datasets} 17 | 18 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tldextract 2 | Type: Package 3 | Title: Extract top level domain, domain, and subdomain from host name 4 | Version: 1.0 5 | Date: 2014-08-02 6 | Author: Jay Jacobs 7 | Maintainer: Jay Jacobs 8 | Description: After working with 9 | in python, I wanted the same functionality within R. The list of top level 10 | domains are loaded from 11 | . The data is 12 | stored in the package and may need to be updated with the getTLD() function. 
13 | License: MIT 14 | Depends: 15 | data.table, 16 | httr 17 | Suggests: 18 | testthat 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .AppleDouble 3 | .LSOverride 4 | Icon 5 | .RProj.user 6 | *.RProj 7 | .Rhistory 8 | .RData 9 | suda.Rcheck 10 | rcode/ch4 11 | .RDataTmp 12 | chapters/ch09/data/vast/* 13 | 14 | <<<<<<< HEAD 15 | .dropbox 16 | .Dropbox 17 | 18 | Icon 19 | 20 | ======= 21 | >>>>>>> 48c5aa08dcc5b421a341daefa6ea949090ecf315 22 | # Thumbnails 23 | ._* 24 | 25 | # Files that might appear on external disk 26 | .Spotlight-V100 27 | .Trashes 28 | rcode/.Rhistory 29 | <<<<<<< HEAD 30 | # History files 31 | .Rhistory 32 | 33 | # Example code in package build process 34 | *-Ex.R 35 | # Windows image file caches 36 | Thumbs.db 37 | ehthumbs.db 38 | 39 | # Folder config file 40 | Desktop.ini 41 | 42 | # Recycle Bin used on file shares 43 | $RECYCLE.BIN/ 44 | ======= 45 | .Rproj.user 46 | >>>>>>> b0e3616219735a95f64240a81c62881f41b8989a 47 | 48 | *.ipynb 49 | 50 | chapters/ch05/ch05.zip 51 | -------------------------------------------------------------------------------- /man/tldextract.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.1): do not edit by hand 2 | \name{tldextract} 3 | \alias{tldextract} 4 | \title{Extract the top level domain, domain and subdomain from a host name} 5 | \usage{ 6 | tldextract(host, tldnames = NULL) 7 | } 8 | \arguments{ 9 | \item{host}{vector of one or more host names} 10 | 11 | \item{tldnames}{vector of TLD names (see \code{\link{getTLD}})} 12 | } 13 | \description{ 14 | Given one or more host names, this will return a data frame with four column: 15 | \itemize{ 16 | \item "host": the original host name 17 | \item "tld": the top level domain extracted 18 | \item "domain" the domain extracted 19 | \item "subdomain" 
one or more subdomains prepending the domain 20 | } 21 | } 22 | \details{ 23 | If a hostname is not understandable (no top level domain is matched), it will 24 | return NA for the three components. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | hosts <- tldextract(c("www.google.com", "testing.co.uk"), tldnames=getTLD()) 29 | } 30 | } 31 | \seealso{ 32 | getTLD 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tests/testthat/test-tldextract.R: -------------------------------------------------------------------------------- 1 | context("TLD Extract") 2 | 3 | test_that("checks simple hostname", { 4 | expect_identical( 5 | tldextract("www.google.com"), 6 | data.frame(host="www.google.com", subdomain="www", 7 | domain="google", tld="com", stringsAsFactors=F)) 8 | 9 | }) 10 | test_that("checks multiple hostname", { 11 | hosts <- c("www.tinong.net", "mdotm.co", "mx.mail.softbank.ne.jp", 12 | "mundopositivo.com.br", "www.pianomedia.eu", 13 | "pages.cinergroup.com.tr", "baixarfilmesdublados.net") 14 | subdomain <- c("www", NA, "mx.mail", NA, "www", "pages", NA) 15 | domain <- c("tinong", "mdotm", "softbank", "mundopositivo", 16 | "pianomedia", "cinergroup", "baixarfilmesdublados") 17 | tld <- c("net", "co", "ne.jp", "com.br", "eu", "com.tr", "net") 18 | hostexpect <- data.frame(host=hosts, subdomain=subdomain, 19 | domain=domain, tld=tld, stringsAsFactors=F) 20 | expect_identical(tldextract(hosts), hostexpect) 21 | }) -------------------------------------------------------------------------------- /man/getTLD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/tldextract.R 3 | \name{getTLD} 4 | \alias{getTLD} 5 | \title{Retrieve a current list of the top level domains} 6 | \usage{ 7 | getTLD(url = "https://publicsuffix.org/list/effective_tld_names.dat") 8 | } 9 | \arguments{ 10 | \item{url}{URL of the 
location for the tld name authority. Set to point to publicsuffix.org 11 | by default; while you can change this, much of \code{getTLD}'s internal parsing may be 12 | specific to publicsuffix.org's format, and so other URLs may actively break or produce 13 | oddly formated results} 14 | } 15 | \description{ 16 | This function will reach out to \url{https://publicsuffix.org/list/effective_tld_names.dat}, 17 | retrieve the contents and create a simple vector of all the top level domains (removing 18 | comments and blanks lines from the file). 19 | 20 | If there is no network connectivity, a cached version of the data ("tldnames") is included with 21 | this package and can be loaded with \code{data("tldnames")} after loading this package. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | tldnames <- getTLD() 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "README" 3 | output: md_document 4 | --- 5 | 6 | After working with in python, I wanted the same functionality within R. The list of top level domains can be automatically loaded from . A cached version of the data is stored in the package. 7 | 8 | ### Installation 9 | 10 | To install this package, use the devtools package: 11 | ```{r eval=FALSE} 12 | devtools::install_github("jayjacobs/tldextract") 13 | ``` 14 | 15 | ### Usage 16 | 17 | ```{r} 18 | library(tldextract) 19 | # use the cached lookup data, simple call 20 | tldextract("www.google.com") 21 | 22 | # it can take multiple domains at the same time 23 | tldextract(c("www.google.com", "www.google.com.ar", "googlemaps.ca", "tbn0.google.cn")) 24 | ``` 25 | 26 | The specification for the top-level domains is cached in the package and is viewable. 
27 | 28 | ```{r} 29 | # view and update the TLD domains list in the tldnames data 30 | data(tldnames) 31 | head(tldnames) 32 | ``` 33 | 34 | If the cached version is out of data and the package isn't updated, the data can be manually loaded, and then passed into the \code{tldextract} function. 35 | 36 | ```{r} 37 | # get most recent TLD listings 38 | tld <- getTLD() # optionally pass in a different URL than the default 39 | manyhosts <- c("pages.parts.marionautomotive.com", "www.embroiderypassion.com", 40 | "fsbusiness.co.uk", "www.vmm.adv.br", "ttfc.cn", "carole.co.il", 41 | "visiontravail.qc.ca", "mail.space-hoppers.co.uk", "chilton.k12.pa.us") 42 | tldextract(manyhosts, tldnames=tld) 43 | ``` 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Important: This is not active see [urltools](https://cran.r-project.org/web/packages/urltools/index.html) 2 | 3 | This package is no longer active. All of the functionality has been integrated into the `urltools` package: 4 | https://cran.r-project.org/web/packages/urltools/index.html 5 | 6 | 7 | # Original docs on this unmaintained version: 8 | 9 | After working with in 10 | python, I wanted the same functionality within R. The list of top level 11 | domains can be automatically loaded from 12 | . A cached 13 | version of the data is stored in the package. 
14 | 15 | ### Installation 16 | 17 | To install this package, use the devtools package: 18 | 19 | devtools::install_github("jayjacobs/tldextract") 20 | 21 | ### Usage 22 | 23 | library(tldextract) 24 | 25 | ## Loading required package: data.table 26 | ## Loading required package: httr 27 | 28 | # use the cached lookup data, simple call 29 | tldextract("www.google.com") 30 | 31 | ## host subdomain domain tld 32 | ## 1 www.google.com www google com 33 | 34 | # it can take multiple domains at the same time 35 | tldextract(c("www.google.com", "www.google.com.ar", "googlemaps.ca", "tbn0.google.cn")) 36 | 37 | ## host subdomain domain tld 38 | ## 1 www.google.com www google com 39 | ## 2 www.google.com.ar www google com.ar 40 | ## 3 googlemaps.ca googlemaps ca 41 | ## 4 tbn0.google.cn tbn0 google cn 42 | 43 | The specification for the top-level domains is cached in the package and 44 | is viewable. 45 | 46 | # view and update the TLD domains list in the tldnames data 47 | data(tldnames) 48 | head(tldnames) 49 | 50 | ## [1] "ac" "com.ac" "edu.ac" "gov.ac" "net.ac" "mil.ac" 51 | 52 | If the cached version is out of data and the package isn't updated, the 53 | data can be manually loaded, and then passed into the function. 
54 | 55 | # get most recent TLD listings 56 | tld <- getTLD() # optionally pass in a different URL than the default 57 | manyhosts <- c("pages.parts.marionautomotive.com", "www.embroiderypassion.com", 58 | "fsbusiness.co.uk", "www.vmm.adv.br", "ttfc.cn", "carole.co.il", 59 | "visiontravail.qc.ca", "mail.space-hoppers.co.uk", "chilton.k12.pa.us") 60 | tldextract(manyhosts, tldnames=tld) 61 | 62 | ## host subdomain domain tld 63 | ## 1 pages.parts.marionautomotive.com pages.parts marionautomotive com 64 | ## 2 www.embroiderypassion.com www embroiderypassion com 65 | ## 3 fsbusiness.co.uk fsbusiness co.uk 66 | ## 4 www.vmm.adv.br www vmm adv.br 67 | ## 5 ttfc.cn ttfc cn 68 | ## 6 carole.co.il carole co.il 69 | ## 7 visiontravail.qc.ca visiontravail qc.ca 70 | ## 8 mail.space-hoppers.co.uk mail space-hoppers co.uk 71 | ## 9 chilton.k12.pa.us chilton k12.pa.us 72 | -------------------------------------------------------------------------------- /R/tldextract.R: -------------------------------------------------------------------------------- 1 | #' @title Retrieve a current list of the top level domains 2 | #' 3 | #' @description This function will reach out to \url{https://publicsuffix.org/list/effective_tld_names.dat}, 4 | #' retrieve the contents and create a simple vector of all the top level domains (removing 5 | #' comments and blanks lines from the file). 6 | #' 7 | #' If there is no network connectivity, a cached version of the data ("tldnames") is included with 8 | #' this package and can be loaded with \code{data("tldnames")} after loading this package. 9 | #' 10 | #' @param url URL of the location for the tld name authority. 
#' Set to point to publicsuffix.org
#'   by default; while you can change this, much of \code{getTLD}'s internal parsing may be
#'   specific to publicsuffix.org's format, and so other URLs may actively break or produce
#'   oddly formatted results
#'
#' @return A character vector of suffix entries (comment lines removed), normalized to UTF-8.
#'
#' @importFrom httr GET content user_agent stop_for_status
#' @export
#' @examples
#' \dontrun{
#' tldnames <- getTLD()
#' }
getTLD <- function(url = "https://publicsuffix.org/list/effective_tld_names.dat") {
  raw_results <- GET(url, user_agent("tldextract - https://github.com/jayjacobs/tldextract"))
  # Fail fast on HTTP errors (4xx/5xx) before attempting to parse the body.
  stop_for_status(raw_results)
  # Declare the encoding explicitly: without `encoding`, httr has to guess and emits
  # a "No encoding supplied: defaulting to UTF-8" message. Splitting on "\n+" also
  # drops blank lines as a side effect of collapsing runs of newlines.
  parsed_results <- unlist(strsplit(content(raw_results, as = "text", encoding = "UTF-8"), "\n+"))
  # Lines beginning with "//" are comments in the public-suffix file; keep the rest.
  # NOTE(review): wildcard ("*.xx") and exception ("!xx") entries are kept verbatim;
  # tldextract() interprets wildcards, but exception rules are not handled anywhere.
  tldnames <- parsed_results[grep(pattern = "^//", x = parsed_results, invert = TRUE)]
  # Normalize to UTF-8 (internationalized suffix entries are non-ASCII).
  tldnames <- iconv(tldnames, to = "UTF-8")
  return(tldnames)
}

#' Extract the top level domain, domain and subdomain from a host name
#'
#' Given one or more host names, this will return a data frame with four columns:
#' \itemize{
#' \item "host": the original host name
#' \item "tld": the top level domain extracted
#' \item "domain" the domain extracted
#' \item "subdomain" one or more subdomains prepending the domain
#' }
#'
#' If a hostname is not understandable (no top level domain is matched), it will
#' return NA for the three components.
#'
#' @param host vector of one or more host names
#' @param tldnames vector of TLD names (see \code{\link{getTLD}}); if \code{NULL}
#'   (the default), the cached \code{tldnames} dataset shipped with the package is used
#' @return A data frame with columns \code{host}, \code{subdomain}, \code{domain}
#'   and \code{tld}; unmatched hosts get \code{NA} in the last three columns
#' @import data.table
#' @export
#' @seealso getTLD
#' @examples
#' \dontrun{
#' hosts <- tldextract(c("www.google.com", "testing.co.uk"), tldnames=getTLD())
#' }
tldextract <- function(host, tldnames=NULL) {
  # Fall back to the packaged suffix list when the caller supplies none.
  # is.null() (rather than missing()) also covers an explicit `tldnames = NULL`
  # call, which previously slipped past the check and broke every later step.
  if (is.null(tldnames)) {
    data("tldnames", envir = environment())
  }
  # Empty input: return an empty result with the documented column layout
  # (the original code errored taking max() over an empty list).
  if (length(host) == 0) {
    return(data.frame(host = character(0), subdomain = character(0),
                      domain = character(0), tld = character(0),
                      stringsAsFactors = FALSE))
  }
  # Split the suffix list into exact entries and wildcard ("*.xx") entries.
  wilds <- grepl('^\\*', tldnames)
  wildcard <- sub('\\*\\.', "", tldnames[wilds])
  static <- tldnames[!wilds]

  subdomain <- domain <- tld <- rep(NA_character_, length(host))
  # Work on lowercase label vectors; names record each host's position in `host`
  # so results can be written back even after matched hosts are dropped.
  splithosts <- strsplit(tolower(host), "[.]")
  names(splithosts) <- seq_along(splithosts)
  maxlen <- max(lengths(splithosts))
  # split.after is the index of the label treated as the registrable domain;
  # everything after it is the candidate suffix. seq_len() (rather than
  # seq(1, maxlen - 1)) yields an empty loop when every host is a single label,
  # instead of iterating over c(1, 0).
  for (split.after in seq_len(maxlen - 1)) {
    templ <- sapply(splithosts, function(x)
      paste0(x[(split.after + 1):length(x)], collapse=".")
    )
    # Exact suffix match (e.g. "com", "co.uk").
    matched <- templ %in% static
    if (any(matched)) {
      index <- as.numeric(names(splithosts)[matched])
      if (split.after > 1) {
        subdomain[index] <- sapply(splithosts[matched], function(x) paste(x[1:(split.after - 1)], collapse="."))
      }
      domain[index] <- sapply(splithosts[matched], function(x) unlist(x[split.after]))
      tld[index] <- sapply(splithosts[matched], function(x) paste(x[(split.after + 1):length(x)], collapse="."))
    }
    # Wildcard suffix match: "*.ck" means any single label under "ck" belongs to
    # the suffix, so the effective split point moves one label to the left.
    matched2 <- templ %in% wildcard
    if (any(matched2) && split.after > 1) {
      safter <- split.after - 1
      index <- as.numeric(names(splithosts)[matched2])
      if (safter > 1) {
        subdomain[index] <- sapply(splithosts[matched2], function(x) paste(x[1:(safter - 1)], collapse="."))
      }
      domain[index] <- sapply(splithosts[matched2], function(x) x[safter])
      tld[index] <- sapply(splithosts[matched2], function(x) paste(x[(safter + 1):length(x)], collapse="."))
    }
    # Drop matched hosts so a later (longer) suffix cannot overwrite the result.
    if (any(matched2 | matched)) {
      splithosts <- splithosts[!(matched | matched2)]
      if (length(splithosts) < 1) break
    }
  }
  data.frame(host=host, subdomain=subdomain, domain=domain, tld=tld, stringsAsFactors = FALSE)
}

#' List of Top-Level Domain Names
#'
#' A dataset containing a single vector of the top-level domain names as retrieved from
#' \url{https://publicsuffix.org/list/effective_tld_names.dat} on August 2, 2014.
#'
#' This can be manually refreshed upon library load with the \code{\link{getTLD}} function.
#'
#' Note, the non-ASCII characters may create a problem, and needs more testing.
#'
#' @docType data
#' @keywords datasets
#' @format A vector containing the top level domains
#' @name tldnames
NULL