├── .Rbuildignore ├── NEWS.md ├── tests ├── testthat.R └── testthat │ └── test-tldextract.R ├── data └── tldnames.rda ├── NAMESPACE ├── tldextract.Rproj ├── man ├── tldnames.Rd ├── tldextract.Rd └── getTLD.Rd ├── DESCRIPTION ├── .gitignore ├── README.Rmd ├── README.md └── R └── tldextract.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # tldextract 1.0 2 | 3 | * Initial release -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_check("tldextract") -------------------------------------------------------------------------------- /data/tldnames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jayjacobs/tldextract/HEAD/data/tldnames.rda -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.1.1): do not edit by hand 2 | 3 | export(getTLD) 4 | export(tldextract) 5 | import(data.table) 6 | importFrom(httr,GET) 7 | importFrom(httr,content) 8 | importFrom(httr,stop_for_status) 9 | importFrom(httr,user_agent) 10 | -------------------------------------------------------------------------------- /tldextract.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: 
pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace,vignette 19 | -------------------------------------------------------------------------------- /man/tldnames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.1): do not edit by hand 2 | \docType{data} 3 | \name{tldnames} 4 | \alias{tldnames} 5 | \title{List of Top-Level Domain Names} 6 | \format{A vector containing the top level domains} 7 | \description{ 8 | A dataset containing a single vector of the top-level domain names as retrieved from 9 | \url{https://publicsuffix.org/list/effective_tld_names.dat} on August 2, 2014. 10 | } 11 | \details{ 12 | This can be manually refreshed upon library load with the \code{\link{getTLD}} function. 13 | 14 | Note, the non-ASCII characters may create a problem, and needs more testing. 15 | } 16 | \keyword{datasets} 17 | 18 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tldextract 2 | Type: Package 3 | Title: Extract top level domain, domain, and subdomain from host name 4 | Version: 1.0 5 | Date: 2014-08-02 6 | Author: Jay Jacobs 7 | Maintainer: Jay Jacobs 8 | Description: After working with 9 | in python, I wanted the same functionality within R. The list of top level 10 | domains are loaded from 11 | . The data is 12 | stored in the package and may need to be updated with the getTLD() function. 
13 | License: MIT 14 | Depends: 15 | data.table, 16 | httr 17 | Suggests: 18 | testthat 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .AppleDouble 3 | .LSOverride 4 | Icon 5 | .RProj.user 6 | *.RProj 7 | .Rhistory 8 | .RData 9 | suda.Rcheck 10 | rcode/ch4 11 | .RDataTmp 12 | chapters/ch09/data/vast/* 13 | 14 | <<<<<<< HEAD 15 | .dropbox 16 | .Dropbox 17 | 18 | Icon 19 | 20 | ======= 21 | >>>>>>> 48c5aa08dcc5b421a341daefa6ea949090ecf315 22 | # Thumbnails 23 | ._* 24 | 25 | # Files that might appear on external disk 26 | .Spotlight-V100 27 | .Trashes 28 | rcode/.Rhistory 29 | <<<<<<< HEAD 30 | # History files 31 | .Rhistory 32 | 33 | # Example code in package build process 34 | *-Ex.R 35 | # Windows image file caches 36 | Thumbs.db 37 | ehthumbs.db 38 | 39 | # Folder config file 40 | Desktop.ini 41 | 42 | # Recycle Bin used on file shares 43 | $RECYCLE.BIN/ 44 | ======= 45 | .Rproj.user 46 | >>>>>>> b0e3616219735a95f64240a81c62881f41b8989a 47 | 48 | *.ipynb 49 | 50 | chapters/ch05/ch05.zip 51 | -------------------------------------------------------------------------------- /man/tldextract.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.0.1): do not edit by hand 2 | \name{tldextract} 3 | \alias{tldextract} 4 | \title{Extract the top level domain, domain and subdomain from a host name} 5 | \usage{ 6 | tldextract(host, tldnames = NULL) 7 | } 8 | \arguments{ 9 | \item{host}{vector of one or more host names} 10 | 11 | \item{tldnames}{vector of TLD names (see \code{\link{getTLD}})} 12 | } 13 | \description{ 14 | Given one or more host names, this will return a data frame with four column: 15 | \itemize{ 16 | \item "host": the original host name 17 | \item "tld": the top level domain extracted 18 | \item "domain" the domain extracted 19 | \item "subdomain" 
one or more subdomains prepending the domain 20 | } 21 | } 22 | \details{ 23 | If a hostname is not understandable (no top level domain is matched), it will 24 | return NA for the three components. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | hosts <- tldextract(c("www.google.com", "testing.co.uk"), tldnames=getTLD()) 29 | } 30 | } 31 | \seealso{ 32 | getTLD 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tests/testthat/test-tldextract.R: -------------------------------------------------------------------------------- 1 | context("TLD Extract") 2 | 3 | test_that("checks simple hostname", { 4 | expect_identical( 5 | tldextract("www.google.com"), 6 | data.frame(host="www.google.com", subdomain="www", 7 | domain="google", tld="com", stringsAsFactors=F)) 8 | 9 | }) 10 | test_that("checks multiple hostname", { 11 | hosts <- c("www.tinong.net", "mdotm.co", "mx.mail.softbank.ne.jp", 12 | "mundopositivo.com.br", "www.pianomedia.eu", 13 | "pages.cinergroup.com.tr", "baixarfilmesdublados.net") 14 | subdomain <- c("www", NA, "mx.mail", NA, "www", "pages", NA) 15 | domain <- c("tinong", "mdotm", "softbank", "mundopositivo", 16 | "pianomedia", "cinergroup", "baixarfilmesdublados") 17 | tld <- c("net", "co", "ne.jp", "com.br", "eu", "com.tr", "net") 18 | hostexpect <- data.frame(host=hosts, subdomain=subdomain, 19 | domain=domain, tld=tld, stringsAsFactors=F) 20 | expect_identical(tldextract(hosts), hostexpect) 21 | }) -------------------------------------------------------------------------------- /man/getTLD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/tldextract.R 3 | \name{getTLD} 4 | \alias{getTLD} 5 | \title{Retrieve a current list of the top level domains} 6 | \usage{ 7 | getTLD(url = "https://publicsuffix.org/list/effective_tld_names.dat") 8 | } 9 | \arguments{ 10 | \item{url}{URL of the 
location for the tld name authority. Set to point to publicsuffix.org 11 | by default; while you can change this, much of \code{getTLD}'s internal parsing may be 12 | specific to publicsuffix.org's format, and so other URLs may actively break or produce 13 | oddly formated results} 14 | } 15 | \description{ 16 | This function will reach out to \url{https://publicsuffix.org/list/effective_tld_names.dat}, 17 | retrieve the contents and create a simple vector of all the top level domains (removing 18 | comments and blanks lines from the file). 19 | 20 | If there is no network connectivity, a cached version of the data ("tldnames") is included with 21 | this package and can be loaded with \code{data("tldnames")} after loading this package. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | tldnames <- getTLD() 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "README" 3 | output: md_document 4 | --- 5 | 6 | After working with in python, I wanted the same functionality within R. The list of top level domains can be automatically loaded from . A cached version of the data is stored in the package. 7 | 8 | ### Installation 9 | 10 | To install this package, use the devtools package: 11 | ```{r eval=FALSE} 12 | devtools::install_github("jayjacobs/tldextract") 13 | ``` 14 | 15 | ### Usage 16 | 17 | ```{r} 18 | library(tldextract) 19 | # use the cached lookup data, simple call 20 | tldextract("www.google.com") 21 | 22 | # it can take multiple domains at the same time 23 | tldextract(c("www.google.com", "www.google.com.ar", "googlemaps.ca", "tbn0.google.cn")) 24 | ``` 25 | 26 | The specification for the top-level domains is cached in the package and is viewable. 
27 | 28 | ```{r} 29 | # view and update the TLD domains list in the tldnames data 30 | data(tldnames) 31 | head(tldnames) 32 | ``` 33 | 34 | If the cached version is out of data and the package isn't updated, the data can be manually loaded, and then passed into the \code{tldextract} function. 35 | 36 | ```{r} 37 | # get most recent TLD listings 38 | tld <- getTLD() # optionally pass in a different URL than the default 39 | manyhosts <- c("pages.parts.marionautomotive.com", "www.embroiderypassion.com", 40 | "fsbusiness.co.uk", "www.vmm.adv.br", "ttfc.cn", "carole.co.il", 41 | "visiontravail.qc.ca", "mail.space-hoppers.co.uk", "chilton.k12.pa.us") 42 | tldextract(manyhosts, tldnames=tld) 43 | ``` 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Important: This is not active see [urltools](https://cran.r-project.org/web/packages/urltools/index.html) 2 | 3 | This package is no longer active. All of the functionality has been integrated into the `urltools` package: 4 | https://cran.r-project.org/web/packages/urltools/index.html 5 | 6 | 7 | # Original docs on this unmaintained version: 8 | 9 | After working with in 10 | python, I wanted the same functionality within R. The list of top level 11 | domains can be automatically loaded from 12 | . A cached 13 | version of the data is stored in the package. 
14 | 15 | ### Installation 16 | 17 | To install this package, use the devtools package: 18 | 19 | devtools::install_github("jayjacobs/tldextract") 20 | 21 | ### Usage 22 | 23 | library(tldextract) 24 | 25 | ## Loading required package: data.table 26 | ## Loading required package: httr 27 | 28 | # use the cached lookup data, simple call 29 | tldextract("www.google.com") 30 | 31 | ## host subdomain domain tld 32 | ## 1 www.google.com www google com 33 | 34 | # it can take multiple domains at the same time 35 | tldextract(c("www.google.com", "www.google.com.ar", "googlemaps.ca", "tbn0.google.cn")) 36 | 37 | ## host subdomain domain tld 38 | ## 1 www.google.com www google com 39 | ## 2 www.google.com.ar www google com.ar 40 | ## 3 googlemaps.ca googlemaps ca 41 | ## 4 tbn0.google.cn tbn0 google cn 42 | 43 | The specification for the top-level domains is cached in the package and 44 | is viewable. 45 | 46 | # view and update the TLD domains list in the tldnames data 47 | data(tldnames) 48 | head(tldnames) 49 | 50 | ## [1] "ac" "com.ac" "edu.ac" "gov.ac" "net.ac" "mil.ac" 51 | 52 | If the cached version is out of data and the package isn't updated, the 53 | data can be manually loaded, and then passed into the function. 
54 | 55 | # get most recent TLD listings 56 | tld <- getTLD() # optionally pass in a different URL than the default 57 | manyhosts <- c("pages.parts.marionautomotive.com", "www.embroiderypassion.com", 58 | "fsbusiness.co.uk", "www.vmm.adv.br", "ttfc.cn", "carole.co.il", 59 | "visiontravail.qc.ca", "mail.space-hoppers.co.uk", "chilton.k12.pa.us") 60 | tldextract(manyhosts, tldnames=tld) 61 | 62 | ## host subdomain domain tld 63 | ## 1 pages.parts.marionautomotive.com pages.parts marionautomotive com 64 | ## 2 www.embroiderypassion.com www embroiderypassion com 65 | ## 3 fsbusiness.co.uk fsbusiness co.uk 66 | ## 4 www.vmm.adv.br www vmm adv.br 67 | ## 5 ttfc.cn ttfc cn 68 | ## 6 carole.co.il carole co.il 69 | ## 7 visiontravail.qc.ca visiontravail qc.ca 70 | ## 8 mail.space-hoppers.co.uk mail space-hoppers co.uk 71 | ## 9 chilton.k12.pa.us chilton k12.pa.us 72 | -------------------------------------------------------------------------------- /R/tldextract.R: -------------------------------------------------------------------------------- 1 | #' @title Retrieve a current list of the top level domains 2 | #' 3 | #' @description This function will reach out to \url{https://publicsuffix.org/list/effective_tld_names.dat}, 4 | #' retrieve the contents and create a simple vector of all the top level domains (removing 5 | #' comments and blanks lines from the file). 6 | #' 7 | #' If there is no network connectivity, a cached version of the data ("tldnames") is included with 8 | #' this package and can be loaded with \code{data("tldnames")} after loading this package. 9 | #' 10 | #' @param url URL of the location for the tld name authority. 
#' Set to point to publicsuffix.org
#'   by default; while you can change this, much of \code{getTLD}'s internal parsing may be
#'   specific to publicsuffix.org's format, and so other URLs may actively break or produce
#'   oddly formatted results
#'
#' @return A character vector of suffix entries (comment lines removed), normalized to UTF-8.
#'
#' @importFrom httr GET content user_agent stop_for_status
#' @export
#' @examples
#' \dontrun{
#' tldnames <- getTLD()
#' }
getTLD <- function(url = "https://publicsuffix.org/list/effective_tld_names.dat") {
  raw_results <- GET(url, user_agent("tldextract - https://github.com/jayjacobs/tldextract"))
  # Fail fast on HTTP errors (4xx/5xx) before attempting to parse the body.
  stop_for_status(raw_results)
  # Declare the encoding explicitly: without `encoding`, httr has to guess and emits
  # a "No encoding supplied: defaulting to UTF-8" message. Splitting on "\n+" also
  # drops blank lines as a side effect of collapsing runs of newlines.
  parsed_results <- unlist(strsplit(content(raw_results, as = "text", encoding = "UTF-8"), "\n+"))
  # Lines beginning with "//" are comments in the public-suffix file; keep the rest.
  # NOTE(review): wildcard ("*.xx") and exception ("!xx") entries are kept verbatim;
  # tldextract() interprets wildcards, but exception rules are not handled anywhere.
  tldnames <- parsed_results[grep(pattern = "^//", x = parsed_results, invert = TRUE)]
  # Normalize to UTF-8 (internationalized suffix entries are non-ASCII).
  tldnames <- iconv(tldnames, to = "UTF-8")
  return(tldnames)
}

#' Extract the top level domain, domain and subdomain from a host name
#'
#' Given one or more host names, this will return a data frame with four columns:
#' \itemize{
#' \item "host": the original host name
#' \item "tld": the top level domain extracted
#' \item "domain" the domain extracted
#' \item "subdomain" one or more subdomains prepending the domain
#' }
#'
#' If a hostname is not understandable (no top level domain is matched), it will
#' return NA for the three components.
#'
#' @param host vector of one or more host names
#' @param tldnames vector of TLD names (see \code{\link{getTLD}}); if \code{NULL}
#'   (the default), the cached \code{tldnames} dataset shipped with the package is used
#' @return A data frame with columns \code{host}, \code{subdomain}, \code{domain}
#'   and \code{tld}; unmatched hosts get \code{NA} in the last three columns
#' @import data.table
#' @export
#' @seealso getTLD
#' @examples
#' \dontrun{
#' hosts <- tldextract(c("www.google.com", "testing.co.uk"), tldnames=getTLD())
#' }
tldextract <- function(host, tldnames=NULL) {
  # Fall back to the packaged suffix list when the caller supplies none.
  # is.null() (rather than missing()) also covers an explicit `tldnames = NULL`
  # call, which previously slipped past the check and broke every later step.
  if (is.null(tldnames)) {
    data("tldnames", envir = environment())
  }
  # Empty input: return an empty result with the documented column layout
  # (the original code errored taking max() over an empty list).
  if (length(host) == 0) {
    return(data.frame(host = character(0), subdomain = character(0),
                      domain = character(0), tld = character(0),
                      stringsAsFactors = FALSE))
  }
  # Split the suffix list into exact entries and wildcard ("*.xx") entries.
  wilds <- grepl('^\\*', tldnames)
  wildcard <- sub('\\*\\.', "", tldnames[wilds])
  static <- tldnames[!wilds]

  subdomain <- domain <- tld <- rep(NA_character_, length(host))
  # Work on lowercase label vectors; names record each host's position in `host`
  # so results can be written back even after matched hosts are dropped.
  splithosts <- strsplit(tolower(host), "[.]")
  names(splithosts) <- seq_along(splithosts)
  maxlen <- max(lengths(splithosts))
  # split.after is the index of the label treated as the registrable domain;
  # everything after it is the candidate suffix. seq_len() (rather than
  # seq(1, maxlen - 1)) yields an empty loop when every host is a single label,
  # instead of iterating over c(1, 0).
  for (split.after in seq_len(maxlen - 1)) {
    templ <- sapply(splithosts, function(x)
      paste0(x[(split.after + 1):length(x)], collapse=".")
    )
    # Exact suffix match (e.g. "com", "co.uk").
    matched <- templ %in% static
    if (any(matched)) {
      index <- as.numeric(names(splithosts)[matched])
      if (split.after > 1) {
        subdomain[index] <- sapply(splithosts[matched], function(x) paste(x[1:(split.after - 1)], collapse="."))
      }
      domain[index] <- sapply(splithosts[matched], function(x) unlist(x[split.after]))
      tld[index] <- sapply(splithosts[matched], function(x) paste(x[(split.after + 1):length(x)], collapse="."))
    }
    # Wildcard suffix match: "*.ck" means any single label under "ck" belongs to
    # the suffix, so the effective split point moves one label to the left.
    matched2 <- templ %in% wildcard
    if (any(matched2) && split.after > 1) {
      safter <- split.after - 1
      index <- as.numeric(names(splithosts)[matched2])
      if (safter > 1) {
        subdomain[index] <- sapply(splithosts[matched2], function(x) paste(x[1:(safter - 1)], collapse="."))
      }
      domain[index] <- sapply(splithosts[matched2], function(x) x[safter])
      tld[index] <- sapply(splithosts[matched2], function(x) paste(x[(safter + 1):length(x)], collapse="."))
    }
    # Drop matched hosts so a later (longer) suffix cannot overwrite the result.
    if (any(matched2 | matched)) {
      splithosts <- splithosts[!(matched | matched2)]
      if (length(splithosts) < 1) break
    }
  }
  data.frame(host=host, subdomain=subdomain, domain=domain, tld=tld, stringsAsFactors = FALSE)
}

#' List of Top-Level Domain Names
#'
#' A dataset containing a single vector of the top-level domain names as retrieved from
#' \url{https://publicsuffix.org/list/effective_tld_names.dat} on August 2, 2014.
#'
#' This can be manually refreshed upon library load with the \code{\link{getTLD}} function.
#'
#' Note, the non-ASCII characters may create a problem, and needs more testing.
#'
#' @docType data
#' @keywords datasets
#' @format A vector containing the top level domains
#' @name tldnames
NULL