├── tests ├── test-all.r └── testthat │ ├── data │ ├── wn-call-exec.rds │ ├── wn-call-info.rds │ ├── wn-ciao-exec.rds │ ├── wn-ciao-info.rds │ ├── wn-nine-exec.rds │ ├── wn-nine-info.rds │ ├── wn-abcdefghi-exec.rds │ └── wn-abcdefghi-info.rds │ ├── test-exec.r │ ├── test-mining.r │ ├── test-info.r │ └── test-cmd.r ├── R ├── search.r ├── messages.r ├── flatten.r ├── exec.r ├── cmd.r ├── hypernyms.r ├── assertions.r ├── info.r ├── wnwr.r └── mining.r ├── .gitignore ├── NAMESPACE ├── man ├── extract_gloss.Rd ├── delete_whit.Rd ├── delete_with.Rd ├── delete_sensenum.Rd ├── extract_offset.Rd ├── extract_lexinfo.Rd ├── build_list.Rd ├── identify_senses.Rd ├── identify_synsets.Rd └── wnwr.Rd ├── NEWS ├── DESCRIPTION └── README.md /tests/test-all.r: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_check('wnwr') -------------------------------------------------------------------------------- /R/search.r: -------------------------------------------------------------------------------- 1 | # search <- function(word) { 2 | # 3 | # } 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .R* 2 | *.so 3 | *.o 4 | *.dll 5 | *.Rproj 6 | *.sublime* 7 | .Rproj.user 8 | -------------------------------------------------------------------------------- /R/messages.r: -------------------------------------------------------------------------------- 1 | missing_message <- function(var) paste0("'", deparse(substitute(var)), "'", " arg is missing.") 2 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | S3method(print,command) 2 | S3method(print,result) 3 | export(has) 4 | export(hypernyms) 5 | export(info) 6 | -------------------------------------------------------------------------------- /tests/testthat/data/wn-call-exec.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-call-exec.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-call-info.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-call-info.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-ciao-exec.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-ciao-exec.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-ciao-info.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-ciao-info.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-nine-exec.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-nine-exec.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-nine-info.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-nine-info.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-abcdefghi-exec.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-abcdefghi-exec.rds -------------------------------------------------------------------------------- /tests/testthat/data/wn-abcdefghi-info.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leodido/wnwr/develop/tests/testthat/data/wn-abcdefghi-info.rds -------------------------------------------------------------------------------- /man/extract_gloss.Rd: -------------------------------------------------------------------------------- 1 | \name{extract_gloss} 2 | \alias{extract_gloss} 3 | \title{Extracts gloss from a wn string} 4 | \usage{ 5 | extract_gloss(str) 6 | } 7 | \description{ 8 | It is vectorized. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/delete_whit.Rd: -------------------------------------------------------------------------------- 1 | \name{delete_whit} 2 | \alias{delete_whit} 3 | \title{Removes elements that matches regexp} 4 | \usage{ 5 | delete_whit(str, regexp) 6 | } 7 | \description{ 8 | It is vectorized. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/delete_with.Rd: -------------------------------------------------------------------------------- 1 | \name{delete_with} 2 | \alias{delete_with} 3 | \title{Removes elements that matches regexp} 4 | \usage{ 5 | delete_with(str, regexp) 6 | } 7 | \description{ 8 | It is vectorized. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/delete_sensenum.Rd: -------------------------------------------------------------------------------- 1 | \name{delete_sensenum} 2 | \alias{delete_sensenum} 3 | \title{Removes synset number from a wn string} 4 | \usage{ 5 | delete_sensenum(str) 6 | } 7 | \description{ 8 | It is vectorized. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/extract_offset.Rd: -------------------------------------------------------------------------------- 1 | \name{extract_offset} 2 | \alias{extract_offset} 3 | \title{Extracts synset offset from a wn string} 4 | \usage{ 5 | extract_offset(str) 6 | } 7 | \description{ 8 | It is vectorized. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/extract_lexinfo.Rd: -------------------------------------------------------------------------------- 1 | \name{extract_lexinfo} 2 | \alias{extract_lexinfo} 3 | \title{Extracts lexicographer information from a wn string} 4 | \usage{ 5 | extract_lexinfo(str) 6 | } 7 | \description{ 8 | It is vectorized. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/build_list.Rd: -------------------------------------------------------------------------------- 1 | \name{build_list} 2 | \alias{build_list} 3 | \title{Builds a list detecting the presence of a signal} 4 | \usage{ 5 | build_list(vect, signal = "=>\\\\s") 6 | } 7 | \description{ 8 | Builds a list detecting the presence of a signal 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/identify_senses.Rd: -------------------------------------------------------------------------------- 1 | \name{identify_senses} 2 | \alias{identify_senses} 3 | \title{Indexes a character vector identifying the senses which its elements belong} 4 | \usage{ 5 | identify_senses(synset_data) 6 | } 7 | \description{ 8 | It return an index. 9 | } 10 | 11 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Version 0.2.0 2 | ------------------------------------------------------------------------------ 3 | * WordNet command executor 4 | * Word information extraction capability 5 | 6 | Version 0.1.0 7 | ------------------------------------------------------------------------------ 8 | * Wrapper to construct WordNet commands -------------------------------------------------------------------------------- /R/flatten.r: -------------------------------------------------------------------------------- 1 | flatten <- function(x) { 2 | y <- list() 3 | rapply(x, function(x) y <<- c(y,x)) 4 | return(y) 5 | } 6 | 7 | depth <- function(this, thisdepth = 0) { 8 | if(!is.list(this)) { 9 | return(thisdepth) 10 | } else{ 11 | return(max(unlist(lapply(this, depth, thisdepth = thisdepth + 1)))) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /man/identify_synsets.Rd: -------------------------------------------------------------------------------- 1 | \name{identify_synsets} 2 | \alias{identify_synsets} 3 | \title{Indexes a character vector identifying the synset which its elements belong} 4 | \usage{ 5 | identify_synsets(data, 6 | synsets = unlist(unname(getOption("wnwr.supported.synset.types")))) 7 | } 8 | \description{ 9 | It returns an index. 10 | } 11 | 12 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: wnwr 2 | Title: A wordnet wrapper. 3 | Version: 0.2 4 | Author: Leonardo Di Donato 5 | Maintainer: Leonardo Di Donato 6 | Description: A wrapper for the WordNet command (i.e., wn). 7 | License: GPL-3 8 | URL: https://github.com/leodido/wnwr 9 | BugReports: https://github.com/leodido/wnwr/issues 10 | Depends: 11 | R (>= 3.0.0), 12 | assertthat, 13 | wordnet, 14 | stringr 15 | Imports: 16 | tools 17 | Suggests: 18 | testthat 19 | -------------------------------------------------------------------------------- /man/wnwr.Rd: -------------------------------------------------------------------------------- 1 | \docType{package} 2 | \name{wnwr} 3 | \alias{wnwr} 4 | \alias{wnwr-package} 5 | \title{wnwr provides ...} 6 | \description{ 7 | wnwr provides ... 8 | } 9 | \section{Package options}{ 10 | It uses the following \code{options} to configure 11 | behaviour: \itemize{ \item 12 | \code{wnwr.supported.search.types}: vector of supported 13 | search types \item \code{wnwr.supported.search.opts}: 14 | vector of supported search options \item 15 | \code{wnwr.supported.synset.types}: vector of supported 16 | synset types \item \code{wnwr.wn.command}: wordnet shell 17 | command } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /R/exec.r: -------------------------------------------------------------------------------- 1 | exec <- function(cmd) { 2 | # check cmd argument 3 | if (missing(cmd)) stop(missing_message(cmd)) 4 | assert_that(is_command(cmd)) 5 | # command execution 6 | output <- suppressWarnings(tryCatch(system(cmd, intern = TRUE), error = I)) 7 | if (length(output) > 0) { 8 | attrs <- attributes(output) 9 | found_senses <- attrs$status 10 | return(structure(list(result = paste(output, collpase = ''), num_senses = found_senses), class = c('wn', 'result'))) 11 | } else { 12 | return(NULL) 13 | } 14 | } 15 | 16 | #' @export 17 | print.result <- function(x, ...) { 18 | cat( 19 | '* Result:\n', 20 | paste(delete_with(x$result, '\\s*'), collapse = '\n'), 21 | '\n* Number of senses:\n', 22 | x$num_senses, 23 | '\n', 24 | sep = '' 25 | ) 26 | } 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | wnwr [![Analytics](https://ga-beacon.appspot.com/UA-49657176-1/wnwr)](https://github.com/igrigorik/ga-beacon) 2 | ======= 3 | 4 | A wordnet wrapper. 5 | 6 | The current R package contains a set of functions to query the [wordnet](http://wordnet.princeton.edu) lexical database. 7 | 8 | ## Why ? 9 | 10 | ... 11 | 12 | ## Installation 13 | 14 | ... 15 | 16 | ## Usage 17 | 18 | ... 19 | 20 | ### Word informations 21 | 22 | ... 23 | 24 | * `info(word)` 25 | 26 | * `has(word, searchtype)` 27 | 28 | ### Search word informations 29 | 30 | ... 31 | 32 | * `search(word, searchtype, option, sense)` 33 | 34 | * `hypernyms(word, option, sense)` 35 | 36 | ## Requirements 37 | 38 | - R language (version >= 3.0) 39 | - The wordnet software ([download](http://wordnet.princeton.edu/wordnet/download/current-version])) 40 | 41 | ## License 42 | 43 | This package is released under the terms of the [GPL-3](http://opensource.org/licenses/GPL-3.0). 44 | -------------------------------------------------------------------------------- /tests/testthat/test-exec.r: -------------------------------------------------------------------------------- 1 | context('Command executor') 2 | 3 | test_that("errors related to the 'cmd' argument", { 4 | expect_that(exec("wn 'ciao'"), throws_error('not a wn command')) 5 | expect_that(exec(NULL), throws_error('not a wn command')) 6 | expect_that(exec(NA), throws_error('not a wn command')) 7 | expect_that(exec(NA_character_), throws_error('not a wn command')) 8 | expect_that(exec(1L), throws_error('not a wn command')) 9 | expect_that(exec(list()), throws_error('not a wn command')) 10 | expect_that(exec(), throws_error('arg is missing')) 11 | }) 12 | 13 | test_that("command execution", { 14 | files <- file.path('data', list.files('data', pattern = '-exec.rds$')) 15 | matches <- regexpr('-([^-]+)-', files) 16 | matches <- matches + 1 17 | attr(matches, 'match.length') <- attr(matches, 'match.length') - 2 18 | words <- regmatches(files, matches) 19 | invisible(sapply(seq_along(files), function(i) { 20 | expect_equal( 21 | exec(structure(paste0("wn '", words[[i]], "'"), class = c('wn', 'command'))), 22 | readRDS(files[[i]]) 23 | ) 24 | })) 25 | }) -------------------------------------------------------------------------------- /R/cmd.r: -------------------------------------------------------------------------------- 1 | wn_cmd <- function(word, search, opt = NULL, sense_num = NULL, info = FALSE) { 2 | # check word arg 3 | if (missing(word)) stop(missing_message(word)) 4 | assert_that(not_empty_string(word)) 5 | # check if we are in word information mode or not 6 | if (!info) { 7 | # check search arg 8 | if (missing(search)) stop(missing_message(search)) 9 | assert_that(not_empty_character_vector(search)) 10 | search <- match.arg(unique(search), unlist(flatten(getOption('wnwr.supported.search.types'))), several.ok = TRUE) 11 | # check opt arg 12 | current_opt <- as.list(environment())$opt 13 | opt <- match.arg(opt, getOption('wnwr.supported.search.opts')) 14 | } 15 | # construct wn command 16 | if (initDict()) { 17 | cmd <- paste(getOption('wnwr.wn.command'), shQuote(word)) 18 | if (info) return(structure(cmd, class = c('wn', 'command'))) 19 | if (identical(current_opt, opt)) cmd <- paste(cmd, opt, sep = ' -') 20 | if (!is.null(sense_num)) { 21 | assert_that(is.count(sense_num)) 22 | cmd <- paste(cmd, sense_num, sep = ' -n') 23 | } 24 | return(structure(paste(cmd, substring(paste(' -', search, collapse = '', sep = ''), 2)), class = c('wn', 'command'))) 25 | } else { 26 | stop('wordnet not found.') 27 | } 28 | } 29 | 30 | #' @export 31 | print.command <- function(x, ...) cat(x, '\n', sep = '') 32 | -------------------------------------------------------------------------------- /R/hypernyms.r: -------------------------------------------------------------------------------- 1 | #' @export 2 | hypernyms <- function(word, opt = NULL, sense_num = NULL) { 3 | this <- as.character(match.call()[[1]]) 4 | types <- getOption('wnwr.supported.search.types')[this][[1]] 5 | return(extract_tree(exec(wn_cmd(word, types, opt, sense_num)))) 6 | } 7 | 8 | # TODO: extract options 9 | # TODO: filter by sense number 10 | 11 | extract_tree <- function(data) { 12 | if (missing(data)) stop(missing_message(data)) 13 | if (is.null(data)) return(data) 14 | assert_that(is_result(data)) 15 | # remove empty string elements 16 | data <- delete_with(data$result, '\\s*') 17 | data <- delete_with(data, '\\d+\\ssense(s)?\\sof\\s.*') 18 | # retrieve supported synset types 19 | synsets <- unlist(unname(getOption('wnwr.supported.synset.types'))) 20 | # identify and index synset 21 | index <- identify_synsets(data, synsets) # FIXME: see (1) @ mining.r 22 | # subset by index as factor 23 | data <- setNames( 24 | lapply(split(data, unlist(index)), function(i) { 25 | i <- i[-1] 26 | if (length(i) > 0) i 27 | else NULL 28 | }), 29 | unique(names(index)) 30 | ) 31 | # identify senses of each synset and process results 32 | lapply(data, function(sset) { 33 | unname(lapply(split(sset, identify_senses(sset)), function(sense) { 34 | sense <- sense[-1] 35 | if (length(sense) > 0) { 36 | build_list(str_trim(sense, side = 'right')) 37 | } else { 38 | NULL 39 | } 40 | })) 41 | }) 42 | } 43 | -------------------------------------------------------------------------------- /R/assertions.r: -------------------------------------------------------------------------------- 1 | not_empty_string <- function(x) { 2 | assert_that(is.string(x)) 3 | noNA(x) && all(nchar(x) > 0) 4 | } 5 | on_failure(not_empty_string) <- function(call, env) { 6 | paste0(deparse(call$x), ' is an empty string.') 7 | } 8 | 9 | not_null <- function(x) !is.null(x) 10 | on_failure(not_null) <- function(call, env) { 11 | paste0(deparse(call$x), ' is null.') 12 | } 13 | 14 | not_empty_character_vector <- function(x) { 15 | assert_that(is.character(x)) 16 | noNA(x) && all(nchar(x) > 0) 17 | } 18 | on_failure(not_empty_character_vector) <- function(call, env) { 19 | paste0(deparse(call$x), ' contains empty or missing values') 20 | } 21 | 22 | # not_string <- function(x) !is.string(x) 23 | # on_failure(not_string) <- function(call, env) { 24 | # paste0(deparse(call$x), ' is a string.') 25 | # } 26 | 27 | has_class <- function(x, which) has_attr(x, 'class') && which %in% attr(x, 'class', exact = TRUE) 28 | on_failure(has_class) <- function(call, env) { 29 | paste0(deparse(call$x), " does not have class '", eval(call$which, env), "'.") 30 | } 31 | 32 | is_command <- function(x) is.string(x) && has_class(x, 'wn') && has_class(x, 'command') 33 | on_failure(is_command) <- function(call, env) { 34 | paste0(deparse(call$x), " is not a wn command.") 35 | } 36 | 37 | is_result <- function(x) is.list(x) && x %has_name% 'result' && x %has_name% 'num_senses' && has_class(x, 'wn') && has_class(x, 'result') 38 | on_failure(is_result) <- function(call, env) { 39 | paste0(deparse(call$x), " is not a wn result.") 40 | } -------------------------------------------------------------------------------- /R/info.r: -------------------------------------------------------------------------------- 1 | info_cmd <- function(word) { 2 | wn_cmd(word, info = TRUE) 3 | } 4 | 5 | extract_info <- function(info) { 6 | if (missing(info)) stop(missing_message(info)) 7 | if (is.null(info)) return(info) 8 | assert_that(is_result(info)) 9 | # remove empty string elements 10 | info <- delete_with(info$result, '\\s*') 11 | # retrieve supported search types 12 | types <- unlist(flatten(getOption('wnwr.supported.search.types'))) 13 | # retrieve supported synset types 14 | synsets <- unlist(unname(getOption('wnwr.supported.synset.types'))) 15 | # identify and index synset 16 | index <- identify_synsets(info, synsets) 17 | # subset by index as factor 18 | info <- setNames( 19 | lapply(split(info, unlist(index)), function(i) { 20 | i <- i[-1] 21 | if (length(i) > 0) i 22 | else NULL 23 | }), 24 | unique(names(index)) 25 | ) 26 | # no information detected for any synsets ? 27 | if (length(Filter(is.null, info)) == length(synsets)) return(info) 28 | # matching search types only in children with information (not null children) 29 | results <- lapply(Filter(Negate(is.null), info), function(i) { 30 | i <- sub(paste0('.*(', paste(types, collapse = '|'), ').*'), '\\1', i) 31 | i[i %in% types] 32 | }) 33 | return(c(results, Filter(is.null, info))[synsets]) 34 | } 35 | 36 | #' @export 37 | info <- function(word) { 38 | out <- exec(info_cmd(word)) 39 | return(extract_info(out)) 40 | } 41 | 42 | #' @export 43 | has <- function(word, search, synset = NULL, details = FALSE) { 44 | assert_that(is.logical(details)) 45 | info <- info(word) 46 | if (!is.null(synset)) { 47 | assert_that(not_empty_character_vector(synset)) 48 | synset <- match.arg(synset, unlist(unname(getOption('wnwr.supported.synset.types'))), several.ok = TRUE) 49 | info <- info[synset] 50 | } 51 | result <- vapply(info, function(x) search %in% x, logical(1)) 52 | if (!details) return(any(result)) 53 | else return(result) 54 | } 55 | # FIXME: if passed more than one search type it fails! 56 | 57 | -------------------------------------------------------------------------------- /R/wnwr.r: -------------------------------------------------------------------------------- 1 | #' wnwr provides ... 2 | #' 3 | #' @section Package options: 4 | #' 5 | #' It uses the following \code{options} to configure behaviour: 6 | #' \itemize{ 7 | #' \item \code{wnwr.supported.search.types}: vector of supported search types 8 | #' \item \code{wnwr.supported.search.opts}: vector of supported search options 9 | #' \item \code{wnwr.supported.synset.types}: vector of supported synset types 10 | #' \item \code{wnwr.wn.command}: wordnet shell command 11 | #' } 12 | #' @docType package 13 | #' @name wnwr 14 | NULL 15 | 16 | search_type <- list( 17 | antonyms = c('antsn', 'antsv', 'antsa', 'antsr'), 18 | hypernyms = c('hypen', 'hypev'), 19 | hyponyms = c('treen', 'treev'), # tree 20 | verb_entailment = c('entav'), 21 | synonyms = c('synsn', 'synsv', 'synsa', 'synsr'), # ordered by estimated frequency 22 | holonyms = c('hholn'), # hierarchical holonyms # c('holon', 'smemn', 'ssubn', 'sprtn') => all holonyms, member, substance, part of holonyms 23 | meronyms = c('hmern'), # hierarchical meronyms # c('meron', membn', 'subsn', 'partn') => all meronyms, has member meronyms, has substance meronyms, has part meronyms 24 | cause_to = c('causv'), 25 | pertainyms = c('perta', 'pertar'), 26 | attributes = c('attrn', 'attrna'), 27 | derived_forms = c('derin', 'deriv'), 28 | domain = c('domnn', 'domnv' , 'domna', 'domnr'), 29 | domain_terms = c('domtn', 'domtv', 'domta', 'domtr'), 30 | polysemy_count = c('famln', 'famlv', 'famla', 'famlr'), # familiarity 31 | verb_frames = c('framv'), 32 | sisters = c('coorn', 'coorv'), # coordinate terms 33 | synonym_groups = c('simsv'), # grouped by similarity of meaning 34 | compound_words = c('grepn', 'grepv', 'grepa' ,'grepr'), 35 | sense_overview = c('over') 36 | ) 37 | search_opt <- c('h', 'g', 'l', 'a', 'o', 's') 38 | synset_type <- list(n = 'noun', v = 'verb', a = 'adj', r = 'adv') 39 | wn_command <- 'wn' 40 | 41 | .onLoad <- function(libname, pkgname) { 42 | ops <- options() 43 | wnwr_opts <- list( 44 | wnwr.supported.search.types = search_type, 45 | wnwr.supported.search.opts = search_opt, 46 | wnwr.supported.synset.types = synset_type, 47 | wnwr.wn.command = wn_command 48 | ) 49 | to_set <- !(names(wnwr_opts) %in% names(ops)) 50 | if (any(to_set)) options(wnwr_opts[to_set]) 51 | invisible() 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/test-mining.r: -------------------------------------------------------------------------------- 1 | context('Text mining') 2 | 3 | test_that("lexicographer information", { 4 | lex_1 <- c(' beef, beef cattle', ' cattle, cows, kine, oxen, Bos taurus', ' meat1') 5 | expect_equal( 6 | extract_lexinfo(lex_1), 7 | structure( 8 | list(c('noun.animal', 'noun.animal', 'noun.food'), c('beef, beef cattle', 'cattle, cows, kine, oxen, Bos taurus', 'meat1')), 9 | .Names = c('lexinfo', 'sense') 10 | ) 11 | ) 12 | lex_2 <- c('=> beef, beef cattle', ' => cattle, cows, kine, oxen, Bos taurus') 13 | expect_equal( 14 | extract_lexinfo(lex_2), 15 | structure(list(c('noun.animal', 'noun.animal'), c('beef, beef cattle', 'cattle, cows, kine, oxen, Bos taurus')), .Names = c('lexinfo', 'sense')) 16 | ) 17 | }) 18 | 19 | test_that("synset offset", { 20 | so_1 <- c('{02402425} cattle, cows, kine, oxen, Bos taurus', '{02404186} beef, beef cattle', '{00001740} entity {00001740}') 21 | expect_equal( 22 | extract_offset(so_1), 23 | structure( 24 | list(c('02402425', '02404186', '00001740'), c('cattle, cows, kine, oxen, Bos taurus', 'beef, beef cattle', 'entity {00001740}')), 25 | .Names = c('offset', 'sense') 26 | ) 27 | ) 28 | so_2 <- c('=> {02402425} cattle, cows, kine, oxen, Bos taurus', '=> {02404186} beef, beef cattle') 29 | expect_equal( 30 | extract_offset(so_2), 31 | structure(list(c('02402425', '02404186'), c('cattle, cows, kine, oxen, Bos taurus', 'beef, beef cattle')), .Names = c('offset', 'sense')) 32 | ) 33 | }) 34 | 35 | test_that("sense number deletion", { 36 | snd_1 <- 'ciao#1' 37 | expect_equal( 38 | delete_sensenum(snd_1), 39 | 'ciao' 40 | ) 41 | snd_2 <- c('cattle#1') 42 | expect_equal( 43 | delete_sensenum(snd_2), 44 | 'cattle' 45 | ) 46 | snd_3 <- c('cattle#1, cows#1, kine#1, oxen#1, Bos taurus#1', 'bovine#1', 'bovid#1') 47 | expect_equal( 48 | delete_sensenum(snd_3), 49 | c('cattle, cows, kine, oxen, Bos taurus', 'bovine', 'bovid') 50 | ) 51 | }) 52 | 53 | test_that("gloss and examples", { 54 | g_1 <- c( 55 | 'gripe, bitch, grouse, crab, beef, squawk, bellyache, holler -- (complain; "What was he hollering about?")', 56 | 'complain, kick, plain, sound off, quetch, kvetch -- (express complaints, discontent, displeasure, or unhappiness; "My mother complains all day"; "She has a lot to kick about")' 57 | ) 58 | expect_equal( 59 | extract_gloss(g_1), 60 | structure( 61 | list( 62 | c('complain', 'express complaints, discontent, displeasure, or unhappiness'), 63 | list('What was he hollering about?', c('My mother complains all day', 'She has a lot to kick about')), 64 | c('gripe, bitch, grouse, crab, beef, squawk, bellyache, holler', 'complain, kick, plain, sound off, quetch, kvetch') 65 | ), 66 | .Names = c('gloss', 'example', 'sense') 67 | ) 68 | ) 69 | }) 70 | -------------------------------------------------------------------------------- /R/mining.r: -------------------------------------------------------------------------------- 1 | #' Extracts lexicographer information from a wn string 2 | #' 3 | #' It is vectorized. 4 | extract_lexinfo <- function(str) { 5 | locat <- str_locate(str, '<\\w+\\.\\w+>\\s') 6 | lexinfo <- str_trim(substr(str, locat[, 'start'], locat[, 'end']), side = 'right') 7 | return(list( 8 | lexinfo = substring(lexinfo, 2, str_length(lexinfo) - 1), 9 | sense = substr(str, locat[, 'end'] + 1, str_length(str)) 10 | )) 11 | } 12 | 13 | #' Extracts gloss from a wn string 14 | #' 15 | #' It is vectorized. 16 | extract_gloss <- function(str) { 17 | data <- strsplit(str, ' -- ') 18 | glosses <- str_replace_all(sapply(data, `[[`, 2), '\\(|\\)|\\\\|\\"', '') 19 | glosses <- str_split_fixed(glosses, '; ', 2) 20 | return(list( 21 | gloss = glosses[, 1], 22 | example = str_split(glosses[, 2], '; '), 23 | sense = sapply(data, `[[`, 1) 24 | )) 25 | } 26 | 27 | #' Extracts synset offset from a wn string 28 | #' 29 | #' It is vectorized. 30 | extract_offset <- function(str) { 31 | locat <- str_locate(str, '\\{(\\d+)\\}\\s') 32 | offset <- str_trim(substr(str, locat[, 'start'], locat[, 'end']), side = 'right') 33 | return(list( 34 | offset = substring(offset, 2, str_length(offset) - 1), 35 | sense = substr(str, locat[, 'end'] + 1, str_length(str)) 36 | )) 37 | } 38 | 39 | #' Removes synset number from a wn string 40 | #' 41 | #' It is vectorized. 42 | delete_sensenum <- function(str) { 43 | return(gsub('\\#\\d+', '', str)) 44 | } 45 | 46 | #' Removes elements that matches regexp 47 | #' 48 | #' It is vectorized. 49 | delete_with <- function(str, regexp) { 50 | str[!grepl(paste0('^', regexp , '$'), str)] 51 | } 52 | 53 | #' Indexes a character vector identifying the synset which its elements belong 54 | #' 55 | #' It returns an index. 56 | identify_synsets <- function(data, synsets = unlist(unname(getOption('wnwr.supported.synset.types')))) { 57 | # identify and index synset 58 | synset_regex <- paste(synsets, collapse = '|') 59 | flags <- grepl(synset_regex, data) 60 | nms <- sub(paste0('.*?(', synset_regex, ').*'), '\\1', data) # non greedy match 61 | assert_that(length(flags) == length(nms)) 62 | nms <- nms[flags] 63 | index <- cumsum(flags) 64 | index <- flatten(tapply(index, index, function(i) setNames(i, rep(nms[i[1]], length(i))), simplify = FALSE)) 65 | # assert_that(length(unique(index)) == length(synsets), !any(unlist(index)) == 0) 66 | # FIXME: (1) this is a strong assumption, it assumes that each synset exists in data 67 | # FIXME: 1st solution: remove this check 68 | # FIXME: 2nd solution: when calling this pass the correct set of available synsets (obtainable via has function) 69 | return(index) 70 | } 71 | 72 | #' Indexes a character vector identifying the senses which its elements belong 73 | #' 74 | #' It return an index. 75 | identify_senses <- function(synset_data) { 76 | cumsum(grepl('Sense\\s\\d+', synset_data)) 77 | } 78 | 79 | #' Builds a list detecting the presence of a signal 80 | build_list <- function(vect, signal = '=>\\s') { 81 | num_spaces <- str_locate(vect, signal)[, 'start'] 82 | level_groups <- split(vect, num_spaces) 83 | depths <- names(level_groups) 84 | out <- lapply(seq_along(level_groups), function(i) { 85 | sub(paste0('^\\s{0,', depths[[i]], '}', signal), '', level_groups[[i]]) 86 | }) 87 | if (is.na(num_spaces[[1]])) out <- c(vect[[1]], out) 88 | out 89 | } 90 | 91 | -------------------------------------------------------------------------------- /tests/testthat/test-info.r: -------------------------------------------------------------------------------- 1 | context('Word informations') 2 | 3 | synset_types <- unlist(unname(synset_type)) 4 | search_types <- unlist(flatten(search_type)) 5 | 6 | test_that("errors related to the 'info' argument", { 7 | expect_that(extract_info(), throws_error('arg is missing')) 8 | expect_that(extract_info(NA), throws_error('not a wn result')) 9 | expect_that(extract_info(list()), throws_error('not a wn result')) 10 | expect_that(extract_info(c('')), throws_error('not a wn result')) 11 | expect_that(extract_info('single information'), throws_error('not a wn result')) 12 | expect_that(extract_info(NA_character_), throws_error('not a wn result')) 13 | }) 14 | 15 | test_that("information extraction", { 16 | expect_null(extract_info(NULL)) 17 | expect_null(extract_info(c())) 18 | # no information detected 19 | expect_equal( 20 | extract_info(structure(list(synset_types, 1), .Names = c('result', 'num_senses'), class = c('wn', 'result'))), 21 | structure(vector('list', length(synset_types)), .Names = synset_types) 22 | ) 23 | # some common cases 24 | obj_1 <- structure( 25 | list( 26 | c(synset_types[1], search_types[1], synset_types[2], search_types[2], synset_types[3], search_types[3], synset_types[4], search_types[4]), 27 | 1 28 | ), 29 | .Names = c('result', 'num_senses'), 30 | class = c('wn', 'result') 31 | ) 32 | expect_equal( 33 | extract_info(obj_1), 34 | structure(list(search_types[1], search_types[2], search_types[3], search_types[4]), .Names = synset_types) 35 | ) 36 | obj_2 <- structure( 37 | list( 38 | c(synset_types[1], search_types[10], synset_types[4], search_types[1:9], synset_types[3], search_types[11], synset_types[2]), 39 | 1 40 | ), 41 | .Names = c('result', 'num_senses'), 42 | class = c('wn', 'result') 43 | ) 44 | expect_equal( 45 | extract_info(obj_2), 46 | structure(list(search_types[10], NULL, search_types[11], search_types[1:9]), .Names = synset_types) 47 | ) 48 | # some real cases 49 | files <- file.path('data', list.files('data', pattern = '-info.rds$')) 50 | matches <- regexpr('-([^-]+)-', files) 51 | matches <- matches + 1 52 | attr(matches, 'match.length') <- attr(matches, 'match.length') - 2 53 | words <- regmatches(files, matches) 54 | invisible(sapply(seq_along(files), function(i) { 55 | expect_equal( 56 | info(words[[i]]), 57 | readRDS(files[[i]]) 58 | ) 59 | })) 60 | }) 61 | 62 | test_that("errors related to the 'word' argument", { 63 | expect_that(info(), throws_error('arg is missing')) 64 | expect_that(info(''), throws_error('an empty string')) 65 | expect_that(info(NULL), throws_error('not a string')) 66 | expect_that(info(NA), throws_error('not a string')) 67 | }) 68 | 69 | test_that("word information command syntax", { 70 | expect_equal(info_cmd('word'), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 71 | }) 72 | 73 | test_that("errors related to the 'synset' argument", { 74 | expect_that(has('ciao', 'hypen', NA), throws_error('not a character vector')) 75 | expect_that(has('ciao', 'hypen', NA, details = TRUE), throws_error('not a character vector')) 76 | expect_that(has('ciao', 'hypen', list()), throws_error('not a character vector')) 77 | expect_that(has('ciao', 'hypen', c('')), throws_error('empty or missing values')) 78 | expect_that(has('ciao', 'hypen', NA_character_), throws_error('empty or missing values')) 79 | }) 80 | 81 | test_that("has information", { 82 | expect_true(has('ciao', 'hypen', synset = NULL)) 83 | expect_equal( 84 | has('ciao', 'hypen', synset = NULL, details = TRUE), 85 | structure(c(TRUE, FALSE, FALSE, FALSE), .Names = c("noun", "verb", "adj", "adv")) 86 | ) 87 | expect_true(has('ciao', 'hypen', c())) 88 | expect_equal( 89 | has('ciao', 'hypen', c(), details = TRUE), 90 | structure(c(TRUE, FALSE, FALSE, FALSE), .Names = c("noun", "verb", "adj", "adv")) 91 | ) 92 | }) 93 | -------------------------------------------------------------------------------- /tests/testthat/test-cmd.r: -------------------------------------------------------------------------------- 1 | context('Shell command syntax') 2 | 3 | search_types <- unlist(flatten(search_type)) 4 | 5 | test_that("Errors related to the 'word' argument", { 6 | expect_that(wn_cmd(), throws_error('is missing')) 7 | expect_that(wn_cmd(c()), throws_error('not a string')) 8 | expect_that(wn_cmd(c('x', 'y')), throws_error('not a string')) 9 | expect_that(wn_cmd(10), throws_error('not a string')) 10 | expect_that(wn_cmd(NA_integer_), throws_error('not a string')) 11 | expect_that(wn_cmd(NULL), throws_error('not a string')) 12 | expect_that(wn_cmd(NA), throws_error('not a string')) 13 | expect_that(wn_cmd(NA_character_), throws_error('empty string')) 14 | expect_that(wn_cmd(c('')), throws_error('empty string')) 15 | expect_that(wn_cmd(''), throws_error('empty string')) 16 | }) 17 | 18 | test_that("Errors related to the 'search' argument", { 19 | expect_that(wn_cmd('word'), throws_error('is missing')) 20 | expect_that(wn_cmd('word', c()), throws_error('not a character vector')) 21 | expect_that(wn_cmd('word', NA_integer_), throws_error('not a character vector')) 22 | expect_that(wn_cmd('word', NULL), throws_error('not a character vector')) 23 | expect_that(wn_cmd('word', NA), throws_error('not a character vector')) 24 | expect_that(wn_cmd('word', NA_character_), throws_error('contains empty or missing values')) 25 | expect_that(wn_cmd('word', ''), throws_error('contains empty or missing values')) 26 | expect_that(wn_cmd('word', c('')), throws_error('contains empty or missing values')) 27 | expect_that(wn_cmd('word', c('x')), throws_error('should be one of')) 28 | }) 29 | 30 | test_search_arg <- function(x) { 31 | expect_equal( 32 | wn_cmd('word', x), 33 | structure(paste0(wn_command, " 'word' ", substring(paste(' -', unique(x), collapse = '', sep = ''), 2)), class = c('wn', 'command')) 34 | ) 35 | } 36 | 37 | test_that('Supported search types', { 38 | # # all possible combinations 39 | # invisible(lapply(seq_len(length(search_types)), function(m) { 40 | # invisible(combn( 41 | # search_types, 42 | # m, 43 | # test_search_arg, 44 | # simplify = FALSE 45 | # )) 46 | # })) 47 | # one command, one search type 48 | invisible(combn( 49 | search_types, 50 | 1, 51 | test_search_arg, 52 | simplify = FALSE 53 | )) 54 | # one command, all search types 55 | invisible(combn( 56 | search_types, 57 | length(search_types), 58 | test_search_arg, 59 | simplify = FALSE 60 | )) 61 | # duplicates removed 62 | test_search_arg(rep(search_types[3], 3)) 63 | }) 64 | 65 | test_that("errors related to the 'opt' argument", { 66 | expect_that(wn_cmd('word', search_types[1], NA), throws_error('must be NULL or a character vector')) 67 | expect_that(wn_cmd('word', search_types[1], 1L), throws_error('must be NULL or a character vector')) 68 | expect_that(wn_cmd('word', search_types[1], 'x'), throws_error('should be one of')) 69 | expect_that(wn_cmd('word', search_types[1], c('x')), throws_error('should be one of')) 70 | expect_that(wn_cmd('word', search_types[1], NA_character_), throws_error('should be one of')) 71 | expect_that(wn_cmd('word', search_types[1], c('x', 'y')), throws_error('must be of length 1')) 72 | }) 73 | 74 | test_that("supported search options", { 75 | expect_equal(wn_cmd('word', search_types[1]), structure(paste0(wn_command, " 'word' -", search_types[1]), class = c('wn', 'command'))) 76 | expect_equal(wn_cmd('word', search_types[1], NULL), structure(paste0(wn_command, " 'word' -", search_types[1]), class = c('wn', 'command'))) 77 | # all supported options 78 | invisible(lapply( 79 | search_opt, 80 | function(o) expect_equal(wn_cmd('word', search_types[1], o), structure(paste0(wn_command, " 'word' -", o, " -", search_types[1]), class = c('wn', 'command'))) 81 | )) 82 | }) 83 | 84 | test_that("errors related to the 'sense_num' argument", { 85 | expect_that(wn_cmd('word', search_types[1], search_opt[1], letters[1:5L]), throws_error('not a count')) 86 | expect_that(wn_cmd('word', search_types[1], search_opt[1], 'abc'), throws_error('not a count')) 87 | expect_that(wn_cmd('word', search_types[1], search_opt[1], NA), throws_error('not a count')) 88 | expect_that(wn_cmd('word', search_types[1], search_opt[1], 1:10L), throws_error('not a count')) 89 | expect_that(wn_cmd('word', search_types[1], search_opt[1], NA_integer_), throws_error('missing value')) 90 | }) 91 | 92 | test_that("sense number filter", { 93 | expect_equal(wn_cmd('word', search_types[1]), structure(paste0(wn_command, " 'word' -", search_types[1]), class = c('wn', 'command'))) 94 | expect_equal(wn_cmd('word', search_types[1], search_opt[1]), structure(paste0(wn_command, " 'word' -", search_opt[1], " -", search_types[1]), class = c('wn', 'command'))) 95 | expect_equal(wn_cmd('word', search_types[1], search_opt[1], NULL), structure(paste0(wn_command, " 'word' -", search_opt[1], " -", search_types[1]), class = c('wn', 'command'))) 96 | expect_equal(wn_cmd('word', search_types[1], search_opt[1], c()), structure(paste0(wn_command, " 'word' -", search_opt[1], " -", search_types[1]), class = c('wn', 'command'))) 97 | expect_equal(wn_cmd('word', search_types[1], sense_num = 10L), structure(paste0(wn_command, " 'word' -n", 10L, " -", search_types[1]), class = c('wn', 'command'))) 98 | expect_equal(wn_cmd('word', search_types[1], sense_num = 1), structure(paste0(wn_command, " 'word' -n", 1, " -", search_types[1]), class = c('wn', 'command'))) 99 | expect_equal(wn_cmd('word', search_types[1], search_opt[1], c(1)), structure(paste0(wn_command, " 'word' -", search_opt[1], " -n", 1, " -", search_types[1]), class = c('wn', 'command'))) 100 | }) 101 | 102 | test_that("parameters ignored in word information mode", { 103 | expect_equal(wn_cmd('word', info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 104 | expect_equal(wn_cmd('word', search_types[1], info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 105 | expect_equal(wn_cmd('word', search_types, info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 106 | expect_equal(wn_cmd('word', search_types[1], search_opt[1], info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 107 | expect_equal(wn_cmd('word', search_types[1], letters[1:20], info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 108 | expect_equal(wn_cmd('word', search_types[1], search_opt[1], 2L, info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 109 | expect_equal(wn_cmd('word', search_types[1], search_opt[1], 1:2L, info = TRUE), structure(paste0(wn_command, " 'word'"), class = c('wn', 'command'))) 110 | }) 111 | --------------------------------------------------------------------------------