├── .RData
├── data
    ├── dutch.rda
    ├── tokens_spacy.rda
    ├── tokens_corenlp.rda
    ├── quote_punctuation.rda
    ├── tokens_dutchclauses.rda
    └── tokens_dutchquotes.rda
├── tests
    ├── testthat.R
    └── testthat
    │   ├── Rplots.pdf
    │   ├── test_reshape.r
    │   ├── test_find_nodes.r
    │   ├── test_corenlp.r
    │   └── test_alpino.r
├── Querying_dependency_trees.pdf
├── man
    ├── figures
    │   ├── README-fig1-1.png
    │   ├── README-tree-1.pdf
    │   ├── README-tree-2.pdf
    │   └── README-unnamed-chunk-17-1.png
    ├── pipe.Rd
    ├── dutch.Rd
    ├── tokens_spacy.Rd
    ├── tokens_corenlp.Rd
    ├── tokens_dutchquotes.Rd
    ├── quote_punctuation.Rd
    ├── tokens_dutchclauses.Rd
    ├── NOT.Rd
    ├── OR.Rd
    ├── AND.Rd
    ├── fill.Rd
    ├── print.tQuery.Rd
    ├── get_branch_id.Rd
    ├── get_long_ids.Rd
    ├── rsyntax_threads.Rd
    ├── unselect_nodes.Rd
    ├── reselect_nodes.Rd
    ├── BREAK.Rd
    ├── set_rsyntax_threads.Rd
    ├── subset_nodes.Rd
    ├── isolate_branch.Rd
    ├── mutate_nodes.Rd
    ├── selected_nodes.Rd
    ├── chop.Rd
    ├── remove_fill.Rd
    ├── copy_fill.Rd
    ├── remove_nodes.Rd
    ├── get_nodes.Rd
    ├── annotate_nodes.Rd
    ├── copy_nodes.Rd
    ├── split_UD_conj.Rd
    ├── as_tokenindex.Rd
    ├── cast_text.Rd
    ├── select_nodes.Rd
    ├── syntax_reader.Rd
    ├── apply_queries.Rd
    ├── climb_tree.Rd
    ├── annotate.Rd
    ├── annotate_tqueries.Rd
    ├── custom_fill.Rd
    ├── plot_tree.Rd
    ├── tquery.Rd
    ├── add_span_quotes.Rd
    └── nested_nodes.Rd
├── R
    ├── imports.r
    ├── utils-pipe.R
    ├── zzz.r
    ├── annotation_pipe.r
    ├── data.r
    ├── cast_annotations.r
    ├── util.R
    ├── deprecated.r
    ├── syntax_rules.r
    ├── melt_nodes.r
    ├── print_functions.r
    ├── filter_tokens.r
    ├── tokenbrowser.r
    ├── isolate_branches.r
    ├── token_index.r
    ├── find_nodes.r
    ├── annotate.r
    ├── applied_reshape.r
    └── recursive_search.r
├── .Rbuildignore
├── .gitignore
├── rsyntax.Rproj
├── DESCRIPTION
├── NAMESPACE
└── README.Rmd


/.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/.RData


--------------------------------------------------------------------------------
/data/dutch.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/data/dutch.rda


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(rsyntax)
3 | 
4 | test_check("rsyntax")
5 | 


--------------------------------------------------------------------------------
/data/tokens_spacy.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/data/tokens_spacy.rda


--------------------------------------------------------------------------------
/data/tokens_corenlp.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/data/tokens_corenlp.rda


--------------------------------------------------------------------------------
/data/quote_punctuation.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/data/quote_punctuation.rda


--------------------------------------------------------------------------------
/tests/testthat/Rplots.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/tests/testthat/Rplots.pdf


--------------------------------------------------------------------------------
/data/tokens_dutchclauses.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/data/tokens_dutchclauses.rda


--------------------------------------------------------------------------------
/data/tokens_dutchquotes.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/data/tokens_dutchquotes.rda


--------------------------------------------------------------------------------
/Querying_dependency_trees.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/Querying_dependency_trees.pdf


--------------------------------------------------------------------------------
/man/figures/README-fig1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/man/figures/README-fig1-1.png


--------------------------------------------------------------------------------
/man/figures/README-tree-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/man/figures/README-tree-1.pdf


--------------------------------------------------------------------------------
/man/figures/README-tree-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/man/figures/README-tree-2.pdf


--------------------------------------------------------------------------------
/R/imports.r:
--------------------------------------------------------------------------------
1 | ## Let roxygen2 add imports to namespace
2 | 
3 | #' @import data.table
4 | #' @import magrittr
5 | NULL
6 | 
7 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^Querying\_dependency\_trees\.pdf$
2 | 
3 | ^.*\.Rproj$
4 | ^\.Rproj\.user$
5 | ^README\.Rmd$
6 | ^README-.*\.png$
7 | 


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vanatteveldt/rsyntax/HEAD/man/figures/README-unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/man/dutch.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.r
 3 | \docType{data}
 4 | \name{dutch}
 5 | \alias{dutch}
 6 | \title{Dutch lemma}
 7 | \format{
 8 | list
 9 | }
10 | \usage{
11 | data(dutch)
12 | }
13 | \description{
14 | Various categories of lemma, for use in syntax queries
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/tokens_spacy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.r
 3 | \docType{data}
 4 | \name{tokens_spacy}
 5 | \alias{tokens_spacy}
 6 | \title{Example tokens for spacy English}
 7 | \format{
 8 | data.frame
 9 | }
10 | \usage{
11 | data(tokens_spacy)
12 | }
13 | \description{
14 | Example tokens for spacy English
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/tokens_corenlp.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.r
 3 | \docType{data}
 4 | \name{tokens_corenlp}
 5 | \alias{tokens_corenlp}
 6 | \title{Example tokens for coreNLP English}
 7 | \format{
 8 | data.frame
 9 | }
10 | \usage{
11 | data(tokens_corenlp)
12 | }
13 | \description{
14 | Example tokens for coreNLP English
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/tokens_dutchquotes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.r
 3 | \docType{data}
 4 | \name{tokens_dutchquotes}
 5 | \alias{tokens_dutchquotes}
 6 | \title{Example tokens for Dutch quotes}
 7 | \format{
 8 | data.frame
 9 | }
10 | \usage{
11 | data(tokens_dutchquotes)
12 | }
13 | \description{
14 | Example tokens for Dutch quotes
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/quote_punctuation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.r
 3 | \docType{data}
 4 | \name{quote_punctuation}
 5 | \alias{quote_punctuation}
 6 | \title{Quote punctuation}
 7 | \format{
 8 | character()
 9 | }
10 | \usage{
11 | data(quote_punctuation)
12 | }
13 | \description{
14 | Punctuation used in quotes, for use in syntax queries
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/tokens_dutchclauses.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.r
 3 | \docType{data}
 4 | \name{tokens_dutchclauses}
 5 | \alias{tokens_dutchclauses}
 6 | \title{Example tokens for Dutch clauses}
 7 | \format{
 8 | data.frame
 9 | }
10 | \usage{
11 | data(tokens_dutchclauses)
12 | }
13 | \description{
14 | Example tokens for Dutch clauses
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Example code in package build process
 6 | *-Ex.R
 7 | 
 8 | # udpipe models 
 9 | udpipe_models/*
10 | english*
11 | 
12 | # RStudio files
13 | .Rproj.user/
14 | 
15 | # produced vignettes
16 | vignettes/*.html
17 | vignettes/*.pdf
18 | .Rproj.user
19 | 
20 | # Rcpp files
21 | src/*.o
22 | src/*.so
23 | src/*.dll
24 | 
25 | .query_test.r
26 | 
27 | env/*
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/rsyntax.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | PackageRoxygenize: rd,collate,namespace,vignette
19 | 


--------------------------------------------------------------------------------
/man/NOT.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{NOT}
 4 | \alias{NOT}
 5 | \title{Use NOT search in tquery}
 6 | \usage{
 7 | NOT(...)
 8 | }
 9 | \arguments{
10 | \item{...}{name-value pairs for look-up terms. see ?query.}
11 | }
12 | \value{
13 | A list, to be used as input to \link{tquery}
14 | }
15 | \description{
16 | Use NOT search in tquery
17 | }
18 | \examples{
19 | tquery(NOT(POS='Noun'))  
20 | }
21 | 


--------------------------------------------------------------------------------
/man/OR.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{OR}
 4 | \alias{OR}
 5 | \title{Use OR search in tquery}
 6 | \usage{
 7 | OR(...)
 8 | }
 9 | \arguments{
10 | \item{...}{name-value pairs for look-up terms. see ?query.}
11 | }
12 | \value{
13 | A list, to be used as input to \link{tquery}
14 | }
15 | \description{
16 | Use OR search in tquery
17 | }
18 | \examples{
19 | tquery(OR(lemma = 'walk', POS='Noun'))
20 | }
21 | 


--------------------------------------------------------------------------------
/man/AND.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{AND}
 4 | \alias{AND}
 5 | \title{Use AND search in tquery}
 6 | \usage{
 7 | AND(...)
 8 | }
 9 | \arguments{
10 | \item{...}{name-value pairs for look-up terms. see ?query.}
11 | }
12 | \value{
13 | A list, to be used as input to \link{tquery}
14 | }
15 | \description{
16 | Use AND search in tquery
17 | }
18 | \examples{
19 | tquery(AND(lemma = 'walk', POS='Noun'))   ## is also the default
20 | }
21 | 


--------------------------------------------------------------------------------
/man/fill.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{fill}
 4 | \alias{fill}
 5 | \title{Specify custom fill behavior}
 6 | \usage{
 7 | fill(...)
 8 | }
 9 | \arguments{
10 | \item{...}{passes to custom_fill}
11 | }
12 | \value{
13 | Should not be used outside of \link{tquery}
14 | }
15 | \description{
16 | This is soft deprecated, with the new preferred function being custom_fill to avoid namespace conflicts with tidyr::fill() and data.table::fill()
17 | }
18 | 


--------------------------------------------------------------------------------
/R/zzz.r:
--------------------------------------------------------------------------------
1 | .onAttach <- function(...) {
2 |   max_threads = 2   ## has to be <= 2 for daily CRAN checks
3 |   if (data.table::getDTthreads() > max_threads) {
4 |     set_rsyntax_threads(max_threads)
5 |     packageStartupMessage(sprintf('rsyntax uses the data.table package, but limits the number of threads used:\n\t- data.table currently uses %s threads\n\t- rsyntax uses %s threads\n\nYou can use set_rsyntax_threads() to use all data.table threads, or set a specific number', data.table::getDTthreads(), max_threads))
6 |   }
7 | }
8 | 


--------------------------------------------------------------------------------
/man/print.tQuery.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/print_functions.r
 3 | \name{print.tQuery}
 4 | \alias{print.tQuery}
 5 | \title{S3 print for tQuery class}
 6 | \usage{
 7 | \method{print}{tQuery}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{a tQuery}
11 | 
12 | \item{...}{not used}
13 | }
14 | \description{
15 | S3 print for tQuery class
16 | }
17 | \examples{
18 | q = tquery(label='quote',
19 |            children(relation='nmod:according_to', label='source',
20 |                     children(label='verb')))
21 | q 
22 | }
23 | 


--------------------------------------------------------------------------------
/R/annotation_pipe.r:
--------------------------------------------------------------------------------
 1 | annotation_pipe <- function(...) {
 2 |   l = list(...)
 3 |   invalid = !sapply(l, class) %in% c('tQuery','tReshape')
 4 |   if (any(invalid)) stop(sprintf('Items need to be tQuery or tReshape objects'))
 5 |   class(l) = c('annotationPipe', class(l))
 6 |   l
 7 | }
 8 | 
 9 | print.annotationPipe <- function(x, ...) {
10 |   print(x$reshape)
11 |   cat('\n')
12 |   for (i in seq_along(x$queries)) {
13 |     cat('\n')
14 |     if (is.null(names(x$queries)[i])) {
15 |       cat('tQuery', i, '\n')
16 |     } else print(names(x$queries)[i])
17 |     print(x$queries[[i]])
18 |   }
19 | }
20 | 
21 | 


--------------------------------------------------------------------------------
/man/get_branch_id.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/isolate_branches.r
 3 | \name{get_branch_id}
 4 | \alias{get_branch_id}
 5 | \title{Add the branch id as a column to the tokenindex}
 6 | \usage{
 7 | get_branch_id(tokens)
 8 | }
 9 | \arguments{
10 | \item{tokens}{A tokenindex}
11 | }
12 | \value{
13 | the tokenindex
14 | }
15 | \description{
16 | After splitting trees into branches
17 | }
18 | \examples{
19 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
20 | tokens = as_tokenindex(tokens)
21 | 
22 | \donttest{
23 | tokens2 = isolate_branch(tokens, relation = 'relcl', copy_parent = TRUE)
24 | get_branch_id(tokens2)
25 | }
26 | }
27 | 


--------------------------------------------------------------------------------
/man/get_long_ids.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/melt_nodes.r
 3 | \name{get_long_ids}
 4 | \alias{get_long_ids}
 5 | \title{Get ids in various forms to extract token_ids}
 6 | \usage{
 7 | get_long_ids(..., select = NULL, with_fill = FALSE)
 8 | }
 9 | \arguments{
10 | \item{...}{Either a data.table with the columns doc_id, sentence and token_id, or the output of \link{apply_queries}}
11 | 
12 | \item{select}{If not null, a character vector for selecting column names}
13 | 
14 | \item{with_fill}{If TRUE, include the ids of the fill nodes}
15 | }
16 | \value{
17 | A data.table with the columns doc_id, sentence and token_id
18 | }
19 | \description{
20 | Get ids in various forms to extract token_ids
21 | }
22 | 


--------------------------------------------------------------------------------
/man/rsyntax_threads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util.R
 3 | \name{rsyntax_threads}
 4 | \alias{rsyntax_threads}
 5 | \title{Get the number of threads to be used by rsyntax functions}
 6 | \usage{
 7 | rsyntax_threads()
 8 | }
 9 | \value{
10 | the setting for the number of threads used by rsyntax
11 | }
12 | \description{
13 | rsyntax relies heavily on the data.table package, which supports multithreading. 
14 | By default, the number of threads set by data.table are used, as you can see with \code{\link[data.table]{getDTthreads}}.
15 | With \code{\link{set_rsyntax_threads}} you can set the number of threads for rsyntax functions, without affecting the data.table settings.
16 | }
17 | \examples{
18 | rsyntax_threads()
19 | }
20 | 


--------------------------------------------------------------------------------
/man/unselect_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{unselect_nodes}
 4 | \alias{unselect_nodes}
 5 | \title{Undo select_nodes}
 6 | \usage{
 7 | unselect_nodes(.tokens)
 8 | }
 9 | \arguments{
10 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
11 | }
12 | \value{
13 | A tokenIndex (without a .nodes attribute)
14 | }
15 | \description{
16 | Not strictly required. Only available for elegance and minor memory efficiency
17 | }
18 | \examples{
19 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
20 | 
21 | tq = tquery(relation = "relcl", label = "relative_clause")
22 | tokens = select_nodes(tokens, tq) 
23 | selected_nodes(tokens)
24 | 
25 | tokens = unselect_nodes(tokens)
26 | 
27 | is.null(attr(tokens, '.nodes'))
28 | }
29 | 


--------------------------------------------------------------------------------
/man/reselect_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{reselect_nodes}
 4 | \alias{reselect_nodes}
 5 | \title{Within a chain of reshape operations, reapply the tquery}
 6 | \usage{
 7 | reselect_nodes(.tokens)
 8 | }
 9 | \arguments{
10 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
11 | }
12 | \value{
13 | A tokenIndex with a .nodes attribute
14 | }
15 | \description{
16 | Within a chain of reshape operations, reapply the tquery
17 | }
18 | \examples{
19 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
20 | 
21 | ## use a tquery to label the nodes that you want to manipulate
22 | tq = tquery(relation = "relcl", label = "relative_clause")
23 | 
24 | ## apply query to select nodes
25 | tokens2 = select_nodes(tokens, tq) 
26 | 
27 | ## reuses the tq, that is stored in tokens2
28 | ## this makes it easy to make the selection anew after a transformation
29 | tokens2 = reselect_nodes(tokens2)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/data.r:
--------------------------------------------------------------------------------
 1 | #' Example tokens for Dutch quotes
 2 | #'
 3 | #' @docType data
 4 | #' @usage data(tokens_dutchquotes)
 5 | #' @format data.frame
 6 | 'tokens_dutchquotes'
 7 | 
 8 | #' Example tokens for Dutch clauses
 9 | #'
10 | #' @docType data
11 | #' @usage data(tokens_dutchclauses)
12 | #' @format data.frame
13 | 'tokens_dutchclauses'
14 | 
15 | #' Example tokens for coreNLP English
16 | #'
17 | #' @docType data
18 | #' @usage data(tokens_corenlp)
19 | #' @format data.frame
20 | 'tokens_corenlp'
21 | 
22 | #' Example tokens for spacy English
23 | #'
24 | #' @docType data
25 | #' @usage data(tokens_spacy)
26 | #' @format data.frame
27 | 'tokens_spacy'
28 | 
29 | #' Dutch lemma
30 | #'
31 | #' Various categories of lemma, for use in syntax queries
32 | #'
33 | #' @docType data
34 | #' @usage data(dutch)
35 | #' @format list
36 | 'dutch'
37 | 
38 | #' Quote punctuation
39 | #' 
40 | #' Punctuation used in quotes, for use in syntax queries
41 | #'
42 | #' @docType data
43 | #' @usage data(quote_punctuation)
44 | #' @format character()
45 | 'quote_punctuation'


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: rsyntax
 2 | Type: Package
 3 | Title: Extract Semantic Relations from Text by Querying and Reshaping Syntax
 4 | Version: 0.1.4
 5 | Date: 2022-06-06
 6 | Author: Kasper Welbers and Wouter van Atteveldt
 7 | Maintainer: Kasper Welbers <kasperwelbers@gmail.com>
 8 | Depends: 
 9 |     R (>= 3.2.0)
10 | Imports:
11 |     igraph,
12 |     tidyselect,
13 |     methods,
14 |     stringi,
15 |     digest,
16 |     rlang,
17 |     magrittr,
18 |     tokenbrowser,
19 |     base64enc,
20 |     png,
21 |     data.table (>= 1.11.8)
22 | Enhances:
23 |     spacyr
24 | LazyData: true
25 | Encoding: UTF-8
26 | Description: Various functions for querying and reshaping dependency trees, 
27 |     as for instance created with the 'spacyr' or 'udpipe' packages.
28 |     This enables the automatic extraction of useful semantic relations from texts,
29 |     such as quotes (who said what) and clauses (who did what). Method proposed in 
30 |     Van Atteveldt et al. (2017) <doi:10.1017/pan.2016.12>.
31 | License: GPL-3
32 | RoxygenNote: 7.1.2
33 | Suggests: 
34 |     testthat
35 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(print,tQuery)
 4 | export("%>%")
 5 | export(AND)
 6 | export(BREAK)
 7 | export(NOT)
 8 | export(OR)
 9 | export(add_span_quotes)
10 | export(annotate)
11 | export(annotate_nodes)
12 | export(annotate_tqueries)
13 | export(apply_queries)
14 | export(as_tokenindex)
15 | export(cast_text)
16 | export(children)
17 | export(chop)
18 | export(climb_tree)
19 | export(copy_fill)
20 | export(copy_nodes)
21 | export(custom_fill)
22 | export(fill)
23 | export(get_branch_id)
24 | export(get_long_ids)
25 | export(get_nodes)
26 | export(isolate_branch)
27 | export(mutate_nodes)
28 | export(not_children)
29 | export(not_parents)
30 | export(parents)
31 | export(plot_tree)
32 | export(remove_fill)
33 | export(remove_nodes)
34 | export(reselect_nodes)
35 | export(rsyntax_threads)
36 | export(select_nodes)
37 | export(selected_nodes)
38 | export(set_rsyntax_threads)
39 | export(split_UD_conj)
40 | export(subset_nodes)
41 | export(syntax_reader)
42 | export(tquery)
43 | export(unselect_nodes)
44 | import(data.table)
45 | import(magrittr)
46 | importFrom(magrittr,"%>%")
47 | 


--------------------------------------------------------------------------------
/man/BREAK.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{BREAK}
 4 | \alias{BREAK}
 5 | \title{A special NOT condition if depth > 1}
 6 | \usage{
 7 | BREAK(...)
 8 | }
 9 | \arguments{
10 | \item{...}{name-value pairs for look-up terms. see ?query.}
11 | }
12 | \value{
13 | A list, to be used as input to \link{tquery}
14 | }
15 | \description{
16 | If depth > 1 in the children, parents or fill function, the children/parents will
17 | be retrieved recursively (i.e. children, children of children, etc.).
18 | If the look-up conditions (e.g., relation = 'nsubj') are not satisfied, a node 
19 | will not be matched by the query, but the search will still continue for it's
20 | parents/children. The special BREAK look-up function allows you to specify a condition
21 | for breaking the recursive loop (lending it's name from the `break` in a for loop).
22 | An example is that you might want to stop the recursive loop in a custom_fill() once it encounters
23 | a nested sentence, such as a relative clause: custom_fill(BREAK(relation = 'relcl')).
24 | }
25 | \examples{
26 | tquery(NOT(POS='Noun'))  
27 | }
28 | 


--------------------------------------------------------------------------------
/man/set_rsyntax_threads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util.R
 3 | \name{set_rsyntax_threads}
 4 | \alias{set_rsyntax_threads}
 5 | \title{Set number of threads to be used by rsyntax functions}
 6 | \usage{
 7 | set_rsyntax_threads(threads = NULL)
 8 | }
 9 | \arguments{
10 | \item{threads}{The number of threads to use. Cannot be higher than number of threads used by data.table, which you can change with \code{\link[data.table]{setDTthreads}}. If left empty (NULL), all data.table threads are used}
11 | }
12 | \value{
13 | Does not return a value. Sets the global 'rsyntax_threads' option.
14 | }
15 | \description{
16 | rsyntax relies heavily on the data.table package, which supports multithreading. 
17 | By default, the number of threads set by data.table are used, as you can see with \code{\link[data.table]{getDTthreads}}.
18 | Here you can set the number of threads for rsyntax functions, without affecting the data.table settings.
19 | }
20 | \examples{
21 | current_threads = rsyntax_threads()
22 | 
23 | set_rsyntax_threads(2)
24 | 
25 | ## undo change (necessary for CRAN checks)
26 | set_rsyntax_threads(current_threads)
27 | }
28 | 


--------------------------------------------------------------------------------
/man/subset_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{subset_nodes}
 4 | \alias{subset_nodes}
 5 | \title{Subset a select_nodes selection}
 6 | \usage{
 7 | subset_nodes(.tokens, subset, copy = TRUE)
 8 | }
 9 | \arguments{
10 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
11 | 
12 | \item{subset}{A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.}
13 | 
14 | \item{copy}{If TRUE, make a deep copy of .tokens. Use if output does not overwrite .tokens}
15 | }
16 | \value{
17 | A tokenIndex with a .nodes attribute
18 | }
19 | \description{
20 | Enables more control in reshape operations
21 | }
22 | \examples{
23 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
24 | 
25 | ## use a tquery to label the nodes that you want to manipulate
26 | tq = tquery(label='verb', children(relation='nsubj'))
27 | 
28 | ## apply query to select nodes
29 | tokens2 = select_nodes(tokens, tq) 
30 | 
31 | selected_nodes(tokens2)$nodes
32 | tokens2 = subset_nodes(tokens2, verb$relation == 'ROOT')
33 | selected_nodes(tokens2)$nodes
34 | }
35 | 


--------------------------------------------------------------------------------
/man/isolate_branch.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/isolate_branches.r
 3 | \name{isolate_branch}
 4 | \alias{isolate_branch}
 5 | \title{Isolate a branch in a dependency tree}
 6 | \usage{
 7 | isolate_branch(tokens, ..., copy_parent = TRUE, copy_parent_fill = TRUE)
 8 | }
 9 | \arguments{
10 | \item{tokens}{A tokenindex}
11 | 
12 | \item{...}{lookup arguments to find the node to split. For example, isolate_branch(tokens, relation='relcl') 
13 | isolates branches of which the top node (the new root) has the relation "relcl".}
14 | 
15 | \item{copy_parent}{If TRUE (default) copy the parent of the branch and include it in the isolated branch}
16 | 
17 | \item{copy_parent_fill}{If TRUE, also copy the parents fill nodes}
18 | }
19 | \value{
20 | the tokenindex
21 | }
22 | \description{
23 | cuts of a branch at the nodes that match the lookup arguents (...).
24 | A "tree_parent" column is added to the tokenindex, that indicates for the new roots
25 | which node the parent was.
26 | }
27 | \examples{
28 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
29 | tokens = as_tokenindex(tokens)
30 | 
31 | tokens2 = isolate_branch(tokens, relation = 'relcl', copy_parent = TRUE)
32 | tokens2
33 | \donttest{
34 | if (interactive()) plot_tree(tokens2)
35 | }
36 | }
37 | 


--------------------------------------------------------------------------------
/man/mutate_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{mutate_nodes}
 4 | \alias{mutate_nodes}
 5 | \title{Mutate nodes}
 6 | \usage{
 7 | mutate_nodes(.tokens, node, ..., subset = NULL)
 8 | }
 9 | \arguments{
10 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
11 | 
12 | \item{node}{The name of the node that is to be mutated}
13 | 
14 | \item{...}{named arguments. The name should be a column in tokens}
15 | 
16 | \item{subset}{A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.}
17 | }
18 | \value{
19 | A tokenIndex with a .nodes attribute
20 | }
21 | \description{
22 | Mutate nodes
23 | }
24 | \examples{
25 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
26 | 
27 | ## use a tquery to label the nodes that you want to manipulate
28 | tq = tquery(relation = "relcl", label = "relative_clause")
29 | 
30 | ## apply query to select nodes
31 | tokens2 = select_nodes(tokens, tq) 
32 | 
33 | ## as an example, we make the parent of the relative_clause
34 | ## nodes NA, effectively cutting of the relcl from the tree
35 | tokens2 = mutate_nodes(tokens2, "relative_clause", parent=NA)
36 | 
37 | tokens2
38 | }
39 | 


--------------------------------------------------------------------------------
/man/selected_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{selected_nodes}
 4 | \alias{selected_nodes}
 5 | \title{If select_nodes() is used, the selected nodes can be extracted with selected_nodes().
 6 | This is mainly for internal use, but it can also be usefull for debugging, and to controll
 7 | loops of reshape operation (e.g. break if no selected nodes left)}
 8 | \usage{
 9 | selected_nodes(.tokens)
10 | }
11 | \arguments{
12 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
13 | }
14 | \value{
15 | A tokenIndex with a .nodes attribute
16 | }
17 | \description{
18 | If select_nodes() is used, the selected nodes can be extracted with selected_nodes().
19 | This is mainly for internal use, but it can also be usefull for debugging, and to controll
20 | loops of reshape operation (e.g. break if no selected nodes left)
21 | }
22 | \examples{
23 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
24 | 
25 | ## use a tquery to label the nodes that you want to manipulate
26 | tq = tquery(relation = "relcl", label = "relative_clause")
27 | 
28 | ## apply query to select nodes
29 | tokens2 = select_nodes(tokens, tq) 
30 | 
31 | ## Get selected nodes from tokenindex
32 | selected_nodes(tokens2)
33 | }
34 | 


--------------------------------------------------------------------------------
/man/chop.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/applied_reshape.r
 3 | \name{chop}
 4 | \alias{chop}
 5 | \title{Chop of a branch of the tree}
 6 | \usage{
 7 | chop(.tokens, ...)
 8 | }
 9 | \arguments{
10 | \item{.tokens}{A tokenIndex}
11 | 
12 | \item{...}{Arguments passed to tquery. For instance, relation = 'punct' cuts off all punctuation dependencies (in universal dependencies)}
13 | }
14 | \value{
15 | A tokenIndex with the rows of the nodes in the selected branches removed
16 | }
17 | \description{
18 | Using the query language for tquery, chop of the branch down from the node that is found
19 | }
20 | \examples{
21 | 
22 | spacy_conjunctions <- function(tokens) {
23 |   no_fill = c('compound*','case', 'relcl')
24 |   tq = tquery(label='target', NOT(relation = 'conj'),
25 |               rsyntax::fill(NOT(relation = no_fill), max_window = c(Inf,0)),
26 |               children(relation = 'conj', label='origin',
27 |                        rsyntax::fill(NOT(relation = no_fill), max_window=c(0,Inf))))
28 |   tokens = climb_tree(tokens, tq)
29 |   chop(tokens, relation = 'cc')
30 | }
31 | 
32 | ## spacy tokens for "Bob and John ate bread and drank wine"
33 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text5',]
34 | 
35 | tokens = spacy_conjunctions(tokens)
36 | tokens
37 | \donttest{
38 | if (interactive()) plot_tree(tokens)
39 | }
40 | }
41 | 


--------------------------------------------------------------------------------
/man/remove_fill.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{remove_fill}
 4 | \alias{remove_fill}
 5 | \title{Remove fill}
 6 | \usage{
 7 | remove_fill(
 8 |   .tokens,
 9 |   node,
10 |   rm_subset_fill = NULL,
11 |   rm_subset = NULL,
12 |   keep_shared = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
17 | 
18 | \item{node}{The name of the node that is to be mutated}
19 | 
20 | \item{rm_subset_fill}{A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to remove only verbs}
21 | 
22 | \item{rm_subset}{A subset expression (that evaluates to a logical vector) to more specifically specify which nodes to remove. The token column for each labeled node in the tquery can be referred to as label$column.}
23 | 
24 | \item{keep_shared}{If there is another node that has the same fill nodes, should the fill nodes that are shared also be removed?}
25 | }
26 | \value{
27 | A tokenIndex with a .nodes attribute
28 | }
29 | \description{
30 | Like remove_nodes, but only removing the fill nodes
31 | }
32 | \examples{
33 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
34 | 
35 | ## use a tquery to label the nodes that you want to manipulate
36 | tq = tquery(pos = 'VERB',
37 |             children(label = 'object', relation='dobj'))
38 | 
39 | ## apply query to select nodes
40 | tokens2 = select_nodes(tokens, tq) 
41 | 
42 | remove_fill(tokens2, 'object')
43 | }
44 | 


--------------------------------------------------------------------------------
/man/copy_fill.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{copy_fill}
 4 | \alias{copy_fill}
 5 | \title{Copy nodes}
 6 | \usage{
 7 | copy_fill(
 8 |   .tokens,
 9 |   from_node,
10 |   to_node,
11 |   subset = NULL,
12 |   subset_fill = NULL,
13 |   only_new = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
18 | 
19 | \item{from_node}{The name of the node from which fill is copied}
20 | 
21 | \item{to_node}{The name of the node to which fill is copied}
22 | 
23 | \item{subset}{A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.}
24 | 
25 | \item{subset_fill}{A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to copy only verbs}
26 | 
27 | \item{only_new}{If TRUE, direct fill children will only be copied to to_node if it does not already have nodes of this relation. This is a good heuristic for dealing with argument drop.}
28 | }
29 | \value{
30 | A tokenIndex with a .nodes attribute
31 | }
32 | \description{
33 | Copy nodes
34 | }
35 | \examples{
36 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
37 | 
38 | tq = tquery(label='object', relation='dobj')
39 |             
40 | tokens2 = select_nodes(tokens, tq)
41 | selected_nodes(tokens2)
42 | 
43 | tokens3 = copy_nodes(tokens2, 'object', 'new_object')
44 | copy_fill(tokens3, 'object', 'new_object')
45 | }
46 | 


--------------------------------------------------------------------------------
/man/remove_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{remove_nodes}
 4 | \alias{remove_nodes}
 5 | \title{Remove nodes}
 6 | \usage{
 7 | remove_nodes(
 8 |   .tokens,
 9 |   node,
10 |   rm_subset = NULL,
11 |   with_fill = TRUE,
12 |   rm_subset_fill = NULL,
13 |   keep_shared = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
18 | 
19 | \item{node}{The name of the node that is to be mutated}
20 | 
21 | \item{rm_subset}{A subset expression (that evaluates to a logical vector) to more specifically specify which nodes to remove. The token column for each labeled node in the tquery can be referred to as label$column.}
22 | 
23 | \item{with_fill}{If TRUE, also remove the fill nodes}
24 | 
25 | \item{rm_subset_fill}{A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to remove only verbs}
26 | 
27 | \item{keep_shared}{If there is another node that has the same fill nodes, should the fill nodes that are shared also be removed?}
28 | }
29 | \value{
30 | A tokenIndex with a .nodes attribute
31 | }
32 | \description{
33 | Remove nodes
34 | }
35 | \examples{
36 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
37 | 
38 | ## use a tquery to label the nodes that you want to manipulate
39 | tq = tquery(pos = 'VERB',
40 |             children(label = 'object', relation='dobj'))
41 | 
42 | ## apply query to select nodes
43 | tokens2 = select_nodes(tokens, tq) 
44 | 
45 | remove_nodes(tokens2, 'object')
46 | remove_nodes(tokens2, 'object', with_fill=FALSE)
47 | }
48 | 


--------------------------------------------------------------------------------
/man/get_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/annotate.r
 3 | \name{get_nodes}
 4 | \alias{get_nodes}
 5 | \title{Transform the nodes to long format and match with token data}
 6 | \usage{
 7 | get_nodes(tokens, nodes, use = NULL, token_cols = c("token"))
 8 | }
 9 | \arguments{
10 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.}
11 | 
12 | \item{nodes}{A data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables.}
13 | 
14 | \item{use}{Optionally, specify which columns from nodes to add. Other than convenient, this is slighly different 
15 | from subsetting the columns in 'nodes' beforehand if fill is TRUE. When the children are collected,
16 | the ids from the not-used columns are still blocked (see 'block')}
17 | 
18 | \item{token_cols}{A character vector, specifying which columns from tokens to include in the output}
19 | }
20 | \value{
21 | A data.table with the nodes in long format, and the specified token_cols attached
22 | }
23 | \description{
24 | Transform the nodes to long format and match with token data
25 | }
26 | \examples{
27 | ## spacy tokens for: Mary loves John, and Mary was loved by John
28 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
29 | 
30 | ## two simple example tqueries
31 | passive = tquery(pos = "VERB*", label = "predicate",
32 |                  children(relation = c("agent"), label = "subject"))
33 | active =  tquery(pos = "VERB*", label = "predicate",
34 |                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
35 | 
36 | nodes = apply_queries(tokens, pas=passive, act=active)
37 | get_nodes(tokens, nodes)
38 | }
39 | 


--------------------------------------------------------------------------------
/man/annotate_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/annotate.r
 3 | \name{annotate_nodes}
 4 | \alias{annotate_nodes}
 5 | \title{Annotate a tokenlist based on rsyntaxNodes}
 6 | \usage{
 7 | annotate_nodes(tokens, nodes, column)
 8 | }
 9 | \arguments{
10 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.}
11 | 
12 | \item{nodes}{An rsyntaxNodes A data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables.}
13 | 
14 | \item{column}{The name of the column in which the annotations are added. The unique ids are added as [column]_id, and the fill values are added as [column]_fill.}
15 | }
16 | \value{
17 | The tokenIndex data.table with the annotation columns added
18 | }
19 | \description{
20 | Use rsyntaxNodes, as created with \link{tquery} and \link{apply_queries}, to annotate a tokenlist.
21 | Three columns will be added: a unique id for the query match, the labels assigned in the tquery, and a column with the fill level (0 is direct match, 1 is child of match, 2 is grandchild, etc.).
22 | }
23 | \details{
24 | Note that you can also directly use \link{annotate}.
25 | }
26 | \examples{
27 | ## spacy tokens for: Mary loves John, and Mary was loved by John
28 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
29 | 
30 | ## two simple example tqueries
31 | passive = tquery(pos = "VERB*", label = "predicate",
32 |                  children(relation = c("agent"), label = "subject"))
33 | active =  tquery(pos = "VERB*", label = "predicate",
34 |                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
35 | 
36 | nodes = apply_queries(tokens, pas=passive, act=active)
37 | annotate_nodes(tokens, nodes, 'clause')
38 | }
39 | 


--------------------------------------------------------------------------------
/man/copy_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{copy_nodes}
 4 | \alias{copy_nodes}
 5 | \title{Copy nodes}
 6 | \usage{
 7 | copy_nodes(
 8 |   .tokens,
 9 |   node,
10 |   new,
11 |   subset = NULL,
12 |   keep_relation = TRUE,
13 |   copy_fill = FALSE,
14 |   subset_fill = NULL,
15 |   only_new = NULL
16 | )
17 | }
18 | \arguments{
19 | \item{.tokens}{A tokenIndex in which nodes are selected with \link{select_nodes}.}
20 | 
21 | \item{node}{The name of the node that is to be copied}
22 | 
23 | \item{new}{The name given to the copy}
24 | 
25 | \item{subset}{A subset expression (that evaluates to a logical vector). The token column for each labeled node in the tquery can be referred to as label$column.}
26 | 
27 | \item{keep_relation}{If FALSE, remove relation (making node a root)}
28 | 
29 | \item{copy_fill}{If TRUE, also copy the fill}
30 | 
31 | \item{subset_fill}{A subset on the fill nodes. Can only directly use token column. For example, use pos == 'VERB' to copy only verbs}
32 | 
33 | \item{only_new}{If TRUE, direct fill children will only be copied to to_node if it does not already have nodes of this relation. This is a good heuristic for dealing with argument drop.}
34 | }
35 | \value{
36 | A tokenIndex with a .nodes attribute
37 | }
38 | \description{
39 | Copy nodes
40 | }
41 | \examples{
42 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text1',]
43 | 
44 | tq = tquery(label='object', relation='dobj')
45 |             
46 | tokens2 = select_nodes(tokens, tq)
47 | selected_nodes(tokens2)
48 | 
49 | copy_nodes(tokens2, 'object', 'new_object')
50 | 
51 | tokens3 = copy_nodes(tokens2, 'object', 'new_object', copy_fill=TRUE)
52 | 
53 | \donttest{
54 | if (interactive()) plot_tree(tokens3, token, pos)
55 | }
56 | }
57 | 


--------------------------------------------------------------------------------
/man/split_UD_conj.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/applied_reshape.r
 3 | \name{split_UD_conj}
 4 | \alias{split_UD_conj}
 5 | \title{Split conjunctions for dependency trees in Universal Dependencies}
 6 | \usage{
 7 | split_UD_conj(
 8 |   tokens,
 9 |   conj_rel = "conj",
10 |   cc_rel = c("cc", "cc:preconj"),
11 |   unpack = T,
12 |   no_fill = NULL,
13 |   min_dist = 0,
14 |   max_dist = Inf,
15 |   right_fill_dist = T,
16 |   compound_rel = c("compound*", "flat"),
17 |   ...
18 | )
19 | }
20 | \arguments{
21 | \item{tokens}{a tokenIndex based on texts parsed with \code{\link[spacyr]{spacy_parse}} (with dependency=TRUE)}
22 | 
23 | \item{conj_rel}{The dependency relation for conjunctions. By default conj}
24 | 
25 | \item{cc_rel}{The dependency relation for the coordinating conjunction. By default cc. This will be removed.}
26 | 
27 | \item{unpack}{If TRUE (default), create separate branches for the parent and the node that inherits the parent position}
28 | 
29 | \item{no_fill}{Optionally, a character vector with relation types that will be excluded from fill}
30 | 
31 | \item{min_dist}{Optionally, a minimal distance between the conj node and its parent}
32 | 
33 | \item{max_dist}{Optionally, a maximum distance between the conj node and its parent}
34 | 
35 | \item{right_fill_dist}{Should fill to the right of the conjunction be used?}
36 | 
37 | \item{compound_rel}{The relation types indicating compounds}
38 | 
39 | \item{...}{specify conditions for the conjunction token. For instance, using 'pos = "VERB"' to only split VERB conjunctions.
40 | This is especially usefull to use different no_fill conditions.}
41 | }
42 | \value{
43 | A tokenindex
44 | }
45 | \description{
46 | Split conjunctions for dependency trees in Universal Dependencies
47 | }
48 | \examples{
49 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text5',]
50 | 
51 | if (interactive()) {
52 | tokens \%>\%
53 |    split_UD_conj() \%>\%
54 |    plot_tree()
55 | }
56 | }
57 | 


--------------------------------------------------------------------------------
/man/as_tokenindex.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/token_index.r
 3 | \name{as_tokenindex}
 4 | \alias{as_tokenindex}
 5 | \title{Prepare a tokenIndex}
 6 | \usage{
 7 | as_tokenindex(
 8 |   tokens,
 9 |   doc_id = c("doc_id", "document_id"),
10 |   sentence = c("sentence", "sentence_id"),
11 |   token_id = c("token_id"),
12 |   parent = c("parent", "head_token_id"),
13 |   relation = c("relation", "dep_rel"),
14 |   paragraph = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{tokens}{A data.frame, data.table, or tokenindex.}
19 | 
20 | \item{doc_id}{candidate names for the document id columns}
21 | 
22 | \item{sentence}{candidate names for sentence (id/index) column}
23 | 
24 | \item{token_id}{candidate names for the  token id column. Has to be numeric (Some parsers return token_id's as numbers with a prefix (t_1, w_1))}
25 | 
26 | \item{parent}{candidate names for the parent id column. Has to be numeric}
27 | 
28 | \item{relation}{candidate names for the relation column}
29 | 
30 | \item{paragraph}{Optionally, the name of a column with paragraph ids. This is only necessary if sentences are numbered per paragraph, and therefore not unique within documents. If given, sentences are re-indexed to be unique within documents.}
31 | }
32 | \value{
33 | a tokenIndex
34 | }
35 | \description{
36 | Creates a tokenIndex data.table. 
37 | Accepts any data.frame given that the required columns (doc_id, sentence, token_id, parent, relation) are present.
38 | The names of these columns must be one of the values specified in the respective arguments.
39 | 
40 | The data in the data.frame will not be changed, with three exceptions. First, the columnnames will be changed if the default values are not used.
41 | Second, if a token has itself as its parent (which in some parsers is used to indicate the root), the parent is set to NA (as used in other parsers) to prevent infinite cycles.
42 | Third, the data will be sorted by doc_id, sentence, token_id.
43 | }
44 | \examples{
45 | as_tokenindex(tokens_corenlp)
46 | }
47 | 


--------------------------------------------------------------------------------
/man/cast_text.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cast_annotations.r
 3 | \name{cast_text}
 4 | \alias{cast_text}
 5 | \title{Cast annotations to text}
 6 | \usage{
 7 | cast_text(tokens, annotation, ..., text_col = "token", na.rm = T)
 8 | }
 9 | \arguments{
10 | \item{tokens}{A tokenIndex}
11 | 
12 | \item{annotation}{The name of annotations (the "column" argument in annotate_tqueries)}
13 | 
14 | \item{...}{Optionally, group annotations together. Named arguments can be given
15 | where the name is the new group, and the value is a character vector with
16 | values in the annotation column. For example, text = c('verb','predicate') would 
17 | group the 'verb' and 'predicate' nodes together under the name 'text'.}
18 | 
19 | \item{text_col}{The name of the column in tokens with the text. Usually this is "token",
20 | but some parsers use alternatives such as 'word'.}
21 | 
22 | \item{na.rm}{If true (default), drop tokens where annotation id is NA (i.e. tokens without labels)}
23 | }
24 | \value{
25 | a data.table
26 | }
27 | \description{
28 | Cast labeled tokens to sentences.
29 | }
30 | \examples{
31 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
32 | 
33 | ## two simple example tqueries
34 | passive = tquery(pos = "VERB*", label = "verb", fill=FALSE,
35 |                  children(relation = "agent",
36 |                           children(label="subject")),
37 |                  children(relation = "nsubjpass", label="object"))
38 | active =  tquery(pos = "VERB*", label = "verb", fill=FALSE,
39 |                  children(relation = c("nsubj", "nsubjpass"), label = "subject"),
40 |                  children(relation = "dobj", label="object"))
41 | 
42 | tokens = annotate_tqueries(tokens, "clause", pas=passive, act=active, overwrite=T)
43 | 
44 | cast_text(tokens, 'clause')
45 | 
46 | ## group annotations
47 | cast_text(tokens, 'clause', text = c('verb','object'))
48 | 
49 | ## use grouping to sort
50 | cast_text(tokens, 'clause', subject = 'subject', 
51 |                             verb = 'verb', object = 'object')
52 | }
53 | 


--------------------------------------------------------------------------------
/man/select_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generic_reshape.r
 3 | \name{select_nodes}
 4 | \alias{select_nodes}
 5 | \title{Apply tquery to initiate reshape operations}
 6 | \usage{
 7 | select_nodes(
 8 |   tokens,
 9 |   tquery,
10 |   fill = TRUE,
11 |   fill_only_first = TRUE,
12 |   .one_per_sentence = FALSE,
13 |   .order = 1
14 | )
15 | }
16 | \arguments{
17 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.}
18 | 
19 | \item{tquery}{A \link{tquery} that selects and labels the nodes that are used in the reshape operations}
20 | 
21 | \item{fill}{Logical, should fill be used?}
22 | 
23 | \item{fill_only_first}{Logical, should a node only be filled once, with the nearest (first) labeled node?}
24 | 
25 | \item{.one_per_sentence}{If true, only one match per sentence is used, giving priority to paterns closest to the root (or fartest from the root if .order = -1). 
26 | This is sometimes necessary to deal with recursion.}
27 | 
28 | \item{.order}{If .one_per_sentence is used, .order determines whether the paterns closest to (1) or farthest away (-1) are used.}
29 | }
30 | \value{
31 | A tokenIndex with a .nodes attribute, that enables the use of reshape operations on the selected nodes
32 | }
33 | \description{
34 | Apply tquery to initiate reshape operations
35 | }
36 | \examples{
37 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
38 | 
39 | ## use a tquery to label the nodes that you want to manipulate
40 | tq = tquery(relation = "relcl", label = "relative_clause")
41 | 
42 | ## apply query to select nodes
43 | tokens2 = select_nodes(tokens, tq) 
44 | 
45 | ## as an example, we make the parent of the relative_clause
46 | ## nodes NA, effectively cutting of the relcl from the tree
47 | tokens2 = mutate_nodes(tokens2, "relative_clause", parent=NA)
48 | 
49 | tokens2
50 | 
51 | \donttest{
52 | if (interactive()) plot_tree(tokens2)
53 | 
54 | ## this is designed to work nicely with magrittr piping
55 | if (interactive()) {
56 | tokens \%>\%
57 |   select_nodes(tq) \%>\%
58 |   mutate_nodes("relative_clause", parent=NA) \%>\%
59 |   plot_tree()
60 | }
61 | }
62 | }
63 | 


--------------------------------------------------------------------------------
/tests/testthat/test_reshape.r:
--------------------------------------------------------------------------------
 1 | context("reshape")
 2 | 
 3 | test_that("reshape nodes works", {
 4 |   library(testthat)
 5 |   
 6 |   tokens = as_tokenindex(tokens_corenlp)
 7 |   tokens = tokens[tokens$sentence == 1,]
 8 |   
 9 |   tq = tquery(label='parent', POS = 'VB*',
10 |               children(label='child', relation='nsubj'))
11 |   
12 |   test = select_nodes(tokens, tq) %>%
13 |     mutate_nodes('child', token = parent$relation)
14 |   expect_equal(test$token[1], 'ROOT')
15 |   
16 |   test = select_nodes(tokens, tq) %>%
17 |     mutate_nodes('child', token = parent$relation, subset = child$token == 'Mary')
18 |   expect_equal(test$token[1], 'John')
19 |   
20 |   test = select_nodes(tokens, tq) %>%
21 |     copy_nodes('child', 'new_child') %>%
22 |     copy_nodes('child', 'new_child2') 
23 |   expect_equal(test$token_id[1:3], c(1,1.1,1.2))  ## id should count up correctly (1, 1.1, 1.2)
24 |   expect_equal(test$token[1:3], c('John','John','John'))
25 |   
26 |   test = select_nodes(tokens, tq) %>%
27 |     copy_nodes('child', 'new_child', subset = child$token == 'Mary')
28 |   expect_equal(test$token[2], 'says')
29 |   
30 |   
31 |   test = select_nodes(tokens, tq) %>%
32 |     copy_fill('parent', 'child')
33 |   expect_equal(test$token_id[3:4], c(3,3.1))
34 |   expect_equal(test$token[3:4], c('Mary','Mary'))
35 |   expect_equal(test$token_id[7:8], c(5,5.1))
36 |   expect_equal(test$token[7:8], c('great','great'))
37 |   
38 |   test = select_nodes(tokens, tq) %>%
39 |     copy_fill('parent', 'child', subset_fill = token %in% c('Mary','great'))
40 |   expect_true(sum(test$token == 'is') == 1) ## "is" should not be copied
41 |   
42 |   test = select_nodes(tokens, tq) %>%
43 |     copy_nodes('parent', 'new_child', copy_fill = TRUE)
44 |   expect_equal(test$token_id[9], 5.1)
45 |   expect_equal(test$parent[9], 2.1)
46 |   
47 |   test = select_nodes(tokens, tq) %>%
48 |     remove_nodes('parent')
49 |   expect_true(nrow(test) == 1)
50 |   
51 |   test = select_nodes(tokens, tq) %>%
52 |     remove_nodes('parent', with_fill = FALSE)
53 |   expect_true(nrow(test) == 5)
54 |   expect_true(sum(is.na(test$parent)) == 3) ## top layer fill of 'parent' now become roots
55 |   
56 | })
57 | 


--------------------------------------------------------------------------------
/man/syntax_reader.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tokenbrowser.r
 3 | \name{syntax_reader}
 4 | \alias{syntax_reader}
 5 | \title{Create a full text browser with highlighted rsyntax annotations}
 6 | \usage{
 7 | syntax_reader(
 8 |   tokens,
 9 |   annotation,
10 |   value = NULL,
11 |   value2 = NULL,
12 |   meta = NULL,
13 |   token_col = "token",
14 |   filename = NULL,
15 |   view = TRUE,
16 |   random_seed = NA,
17 |   ...
18 | )
19 | }
20 | \arguments{
21 | \item{tokens}{A tokenIndex}
22 | 
23 | \item{annotation}{The name of the column that contains the rsyntax annotation}
24 | 
25 | \item{value}{Optionally, a character vector with values in annotation. If used, only these values are fully colored, and the other (non NA) values only have border colors.}
26 | 
27 | \item{value2}{Optionally, a character vector with values in annotation other than those specified in 'value'. If used, only these values have border colors.}
28 | 
29 | \item{meta}{Optionally, a data.frame with document meta data. Has to have a column named doc_id of which the values match with the doc_id column in tokens}
30 | 
31 | \item{token_col}{The name of the column in tokens with the token text}
32 | 
33 | \item{filename}{Optionally, a filename to directly save the file. If not specified, a temporary file is created}
34 | 
35 | \item{view}{If TRUE, the browser will immediatly be viewed in the viewer panel}
36 | 
37 | \item{random_seed}{If a number is given, it is used as a seed to randomize the order of documents. This is usefull for
38 | validations purposes, because the doc_id in the tokenindex is sorted.}
39 | 
40 | \item{...}{Arguments passed to \link[tokenbrowser]{create_browser}}
41 | }
42 | \value{
43 | The url for the file
44 | }
45 | \description{
46 | Create a full text browser with highlighted rsyntax annotations
47 | }
48 | \examples{
49 | tokens = tokens_spacy
50 | 
51 | ## two simple example tqueries
52 | passive = tquery(pos = "VERB*", label = "predicate",
53 |                 children(relation = c("agent"), label = "subject"))
54 | active =  tquery(pos = "VERB*", label = "predicate",
55 |                 children(relation = c("nsubj", "nsubjpass"), label = "subject"))
56 | 
57 | 
58 | \donttest{
59 | tokens = annotate_tqueries(tokens, 'clause', pas=passive, act=active)
60 | syntax_reader(tokens, annotation = 'clause', value = 'subject')
61 | }
62 | }
63 | 


--------------------------------------------------------------------------------
/man/apply_queries.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/syntax_rules.r
 3 | \name{apply_queries}
 4 | \alias{apply_queries}
 5 | \title{Apply queries created with \link{tquery}}
 6 | \usage{
 7 | apply_queries(
 8 |   tokens,
 9 |   ...,
10 |   as_chain = FALSE,
11 |   block = NULL,
12 |   check = FALSE,
13 |   fill = TRUE,
14 |   return_wide = FALSE,
15 |   verbose = FALSE
16 | )
17 | }
18 | \arguments{
19 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.}
20 | 
21 | \item{...}{tqueries, as created with \link{tquery}. Can also be a list with tquery functions. It is recommended to use named arguments/lists, to name the tqueries.}
22 | 
23 | \item{as_chain}{If TRUE, Nodes that have already been assigned assigned earlier in the chain will be ignored (see 'block' argument).}
24 | 
25 | \item{block}{Optionally, specify ids (doc_id - sentence - token_id triples) where find_nodes will stop (ignoring the id and recursive searches through the id). 
26 | Can also be a data.table returned by (a previous) apply_queries, in which case all ids are blocked.}
27 | 
28 | \item{check}{If TRUE, return a warning if nodes occur in multiple patterns, which could indicate that the find_nodes query is not specific enough.}
29 | 
30 | \item{fill}{If TRUE (default) the fill nodes are added. Otherwise these are ignored, even if the queries include fill()}
31 | 
32 | \item{return_wide}{If TRUE, return nodes in wide format.}
33 | 
34 | \item{verbose}{If TRUE, report progress (only useful if multiple queries are used)}
35 | }
36 | \value{
37 | A data.table in which each row is a node for which all conditions are satisfied, and each column is one of the linked nodes 
38 |                (parents / children) with names as specified in the label argument.
39 | }
40 | \description{
41 | Apply queries created with \link{tquery}
42 | }
43 | \examples{
44 | ## spacy tokens for: Mary loves John, and Mary was loved by John
45 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
46 | 
47 | ## two simple example tqueries
48 | passive = tquery(pos = "VERB*", label = "predicate",
49 |                  children(relation = c("agent"), label = "subject"))
50 | active =  tquery(pos = "VERB*", label = "predicate",
51 |                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
52 | 
53 | nodes = apply_queries(tokens, pas=passive, act=active)
54 | nodes
55 | }
56 | 


--------------------------------------------------------------------------------
/R/cast_annotations.r:
--------------------------------------------------------------------------------
 1 | #' Cast annotations to text
 2 | #' 
 3 | #' Cast labeled tokens to sentences.  
 4 | #'
 5 | #' @param tokens       A tokenIndex
 6 | #' @param annotation   The name of annotations (the "column" argument in annotate_tqueries)
 7 | #' @param ...          Optionally, group annotations together. Named arguments can be given
 8 | #'                     where the name is the new group, and the value is a character vector with
 9 | #'                     values in the annotation column. For example, text = c('verb','predicate') would 
10 | #'                     group the 'verb' and 'predicate' nodes together under the name 'text'.
11 | #' @param text_col     The name of the column in tokens with the text. Usually this is "token",
12 | #'                     but some parsers use alternatives such as 'word'.
13 | #' @param na.rm        If true (default), drop tokens where annotation id is NA (i.e. tokens without labels)
14 | #'
15 | #' @return             a data.table
16 | #' @export
17 | #'
18 | #' @examples
19 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
20 | #' 
21 | #' ## two simple example tqueries
22 | #' passive = tquery(pos = "VERB*", label = "verb", fill=FALSE,
23 | #'                  children(relation = "agent",
24 | #'                           children(label="subject")),
25 | #'                  children(relation = "nsubjpass", label="object"))
26 | #' active =  tquery(pos = "VERB*", label = "verb", fill=FALSE,
27 | #'                  children(relation = c("nsubj", "nsubjpass"), label = "subject"),
28 | #'                  children(relation = "dobj", label="object"))
29 | #' 
30 | #' tokens = annotate_tqueries(tokens, "clause", pas=passive, act=active, overwrite=T)
31 | #' 
32 | #' cast_text(tokens, 'clause')
33 | #' 
34 | #' ## group annotations
35 | #' cast_text(tokens, 'clause', text = c('verb','object'))
36 | #' 
37 | #' ## use grouping to sort
38 | #' cast_text(tokens, 'clause', subject = 'subject', 
39 | #'                             verb = 'verb', object = 'object')
40 | cast_text <- function(tokens, annotation, ..., text_col='token', na.rm=T) {
41 |   ann_id = NULL; ann = NULL
42 |   cols = list(...)
43 |   
44 |   d = subset(tokens, select = c('doc_id',text_col, paste0(annotation, c('','_id'))))
45 |   data.table::setnames(d, new=c('doc_id','text','ann','ann_id'))
46 |   if (na.rm) d = subset(d, !is.na(ann_id))
47 |   
48 |   for (col in names(cols)) {
49 |     d[list(cols[[col]]), ann := col, on='ann']
50 |   }
51 |   
52 |   d = data.table::dcast(doc_id + ann_id ~ ann, fun.aggregate=paste, collapse=' ', value.var='text', data=d)
53 |   if (length(cols) > 0) data.table::setcolorder(d, c('doc_id','ann_id',names(cols)))
54 |   d
55 | }


--------------------------------------------------------------------------------
/man/climb_tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/applied_reshape.r
 3 | \name{climb_tree}
 4 | \alias{climb_tree}
 5 | \title{Have a node adopt its parent's position}
 6 | \usage{
 7 | climb_tree(
 8 |   .tokens,
 9 |   tq,
10 |   unpack = TRUE,
11 |   isolate = TRUE,
12 |   take_fill = TRUE,
13 |   give_fill = TRUE,
14 |   only_new = "relation",
15 |   max_iter = 200
16 | )
17 | }
18 | \arguments{
19 | \item{.tokens}{A tokenIndex}
20 | 
21 | \item{tq}{A tquery. Needs to have a node labeled "origin" that has a parent labeled "target"}
22 | 
23 | \item{unpack}{If TRUE (default), create separate branches for the parent and the node that inherits the parent position}
24 | 
25 | \item{isolate}{If unpack is TRUE and isolate is TRUE (default is FALSE), isolate the new branch by recursively unpacking}
26 | 
27 | \item{take_fill}{If TRUE (default), give the node that will inherit the parent position a copy of the parent children (but only if it does not already have children with this relation; see only_new)}
28 | 
29 | \item{give_fill}{If TRUE (default), copy the children of the node that will inherit the parent position to the parent (but only if it does not already have children with this relation; see only_new)}
30 | 
31 | \item{only_new}{A characetr vector giving one or multiple column names that need to be unique for take_fill and give_fill}
32 | 
33 | \item{max_iter}{The climb tree function repeatedly resolves the first conjunction it encounters in a sentence. This can lead to many iterations
34 | for sentences with many (nested) conjunctions. It could be the case that in unforseen cases or with certain parsers
35 | an infinite loop is reached, which is why we use a max_iter argument that breaks the loop and sends a warning if the max is reached.}
36 | }
37 | \value{
38 | The reshaped tokenIndex
39 | }
40 | \description{
41 | given a tquery that identfies a node labeled "origin", that has a parent labeled "target", 
42 | recursively have child adopt the parent's position (parent and relation column)
43 | and adopt parents fill nodes. only_new restricts adding fill nodes to relations that child
44 | does not already have. This seems to be a good heuristic for dealing with argument drop
45 | }
46 | \examples{
47 | 
48 | spacy_conjunctions <- function(tokens) {
49 |   no_fill = c('compound*','case', 'relcl')
50 |   tq = tquery(label='target', NOT(relation = 'conj'),
51 |               rsyntax::fill(NOT(relation = no_fill), max_window = c(Inf,0)),
52 |               children(relation = 'conj', label='origin',
53 |                        rsyntax::fill(NOT(relation = no_fill), max_window=c(0,Inf))))
54 |   tokens = climb_tree(tokens, tq)
55 |   chop(tokens, relation = 'cc')
56 | }
57 | 
58 | ## spacy tokens for "Bob and John ate bread and drank wine"
59 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text5',]
60 | 
61 | tokens = spacy_conjunctions(tokens)
62 | 
63 | tokens
64 | \donttest{
65 | if (interactive()) plot_tree(tokens)
66 | }
67 | }
68 | 


--------------------------------------------------------------------------------
/R/util.R:
--------------------------------------------------------------------------------
 1 | get_children_i <- function(tokens, i) {
 2 |   tokens = as_tokenindex(tokens)
 3 |   select = tokens[i,c('doc_id','sentence','token_id'), with=FALSE]
 4 |   data.table::setnames(select, c('doc_id','sentence','parent'))
 5 |   children = tokens[select, on=c('doc_id','sentence','parent'), nomatch=0, which=TRUE]
 6 |   if (length(children) > 0) children = union(children, get_children_i(tokens, children)) 
 7 |   union(i, children)
 8 | }
 9 | 
10 | 
11 | 
12 | #' Set number of threads to be used by rsyntax functions
13 | #' 
14 | #' rsyntax relies heavily on the data.table package, which supports multithreading. 
15 | #' By default, the number of threads set by data.table are used, as you can see with \code{\link[data.table]{getDTthreads}}.
16 | #' Here you can set the number of threads for rsyntax functions, without affecting the data.table settings.
17 | #'
18 | #' @param threads The number of threads to use. Cannot be higher than number of threads used by data.table, which you can change with \code{\link[data.table]{setDTthreads}}. If left empty (NULL), all data.table threads are used
19 | #'
20 | #' @return Does not return a value. Sets the global 'rsyntax_threads' option.
21 | #' @export
22 | #'
23 | #' @examples
24 | #' current_threads = rsyntax_threads()
25 | #' 
26 | #' set_rsyntax_threads(2)
27 | #' 
28 | #' ## undo change (necessary for CRAN checks)
29 | #' set_rsyntax_threads(current_threads)
30 | set_rsyntax_threads <- function(threads=NULL) {
31 |   options(rsyntax_threads = min(threads, data.table::getDTthreads()))
32 | }
33 | 
34 | #' Get the number of threads to be used by rsyntax functions
35 | #' 
36 | #' rsyntax relies heavily on the data.table package, which supports multithreading. 
37 | #' By default, the number of threads set by data.table are used, as you can see with \code{\link[data.table]{getDTthreads}}.
38 | #' With \code{\link{set_rsyntax_threads}} you can set the number of threads for rsyntax functions, without affecting the data.table settings.
39 | #'
40 | #' @return the setting for the number of threads used by rsyntax
41 | #' @export
42 | #'
43 | #' @examples
44 | #' rsyntax_threads()
45 | rsyntax_threads <- function() {
46 |   go = options('rsyntax_threads')
47 |   if (is.null(go$rsyntax_threads)) data.table::getDTthreads() else min(go$rsyntax_threads, data.table::getDTthreads())
48 | }
49 | 
50 | bquote_s <- function(expr, where=parent.frame()) {
51 |   ## bquote, but for an expression that is already substituted
52 |   unquote <- function(e) if (is.pairlist(e)) 
53 |     as.pairlist(lapply(e, unquote))
54 |   else if (length(e) <= 1L) 
55 |     e
56 |   else if (e[[1L]] == as.name(".")) 
57 |     eval(e[[2L]], where)
58 |   else as.call(lapply(e, unquote))
59 |   unquote(expr)
60 | }
61 | 
62 | rm_nodes <- function(nodes, ids) {
63 |   if (ncol(nodes) > 1) {
64 |     drop = rep(TRUE, nrow(nodes))
65 |     for (j in 2:ncol(nodes)) {
66 |       drop = drop & nodes[[j]] %in% ids
67 |     }
68 |     nodes = nodes[!drop,]
69 |   }
70 |   nodes
71 | }
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/man/annotate.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deprecated.r
 3 | \name{annotate}
 4 | \alias{annotate}
 5 | \title{Annotate a tokenlist based on rsyntax queries}
 6 | \usage{
 7 | annotate(
 8 |   tokens,
 9 |   column,
10 |   ...,
11 |   block = NULL,
12 |   fill = TRUE,
13 |   overwrite = FALSE,
14 |   block_fill = FALSE,
15 |   copy = TRUE,
16 |   verbose = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.}
21 | 
22 | \item{column}{The name of the column in which the annotations are added. The unique ids are added as column_id}
23 | 
24 | \item{...}{One or multiple tqueries, or a list of queries, as created with \link{tquery}. Queries can be given a named by using a named argument, which will be used in the annotation_id to keep track of which query was used.}
25 | 
26 | \item{block}{Optionally, specify ids (doc_id - sentence - token_id triples) that are blocked from querying and filling (ignoring the id and recursive searches through the id).}
27 | 
28 | \item{fill}{Logical. If TRUE (default) also assign the fill nodes (as specified in the tquery). Otherwise these are ignored}
29 | 
30 | \item{overwrite}{If TRUE, existing column will be overwritten. Otherwise (default), the exsting annotations in the column will be blocked, and new annotations will be added. This is identical to using multiple queries.}
31 | 
32 | \item{block_fill}{If TRUE (and overwrite is FALSE), the existing fill nodes will also be blocked. In other words, the new annotations will only be added if the}
33 | 
34 | \item{copy}{If TRUE (default), the data.table is copied. Otherwise, it is changed by reference. Changing by reference is faster and more memory efficient, but is not predictable R style, so is optional.}
35 | 
36 | \item{verbose}{If TRUE, report progress (only usefull if multiple queries are given)}
37 | }
38 | \value{
39 | The tokenIndex with the annotation columns
40 | }
41 | \description{
42 | This function has been renamed to annotate_tqueries.
43 | }
44 | \details{
45 | Apply queries to extract syntax patterns, and add the results as two columns to a tokenlist.
46 | One column contains the ids for each hit. The other column contains the annotations.
47 | Only nodes that are given a name in the tquery (using the 'label' parameter) will be added as annotation.
48 | 
49 | Note that while queries only find 1 node for each labeld component of a pattern (e.g., quote queries have 1 node for "source" and 1 node for "quote"), 
50 | all children of these nodes can be annotated by settting fill to TRUE. If a child has multiple ancestors, only the most direct ancestors are used (see documentation for the fill argument).
51 | }
52 | \examples{
53 | ## spacy tokens for: Mary loves John, and Mary was loved by John
54 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
55 | 
56 | ## two simple example tqueries
57 | passive = tquery(pos = "VERB*", label = "predicate",
58 |                  children(relation = c("agent"), label = "subject"))
59 | active =  tquery(pos = "VERB*", label = "predicate",
60 |                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
61 | 
62 | \donttest{ 
63 | tokens = annotate_tqueries(tokens, "clause", pas=passive, act=active)
64 | tokens
65 | if (interactive()) plot_tree(tokens, annotation='clause')
66 | }
67 | }
68 | 


--------------------------------------------------------------------------------
/R/deprecated.r:
--------------------------------------------------------------------------------
 1 | #' Annotate a tokenlist based on rsyntax queries
 2 | #'
 3 | #' This function has been renamed to annotate_tqueries.
 4 | #'
 5 | #' Apply queries to extract syntax patterns, and add the results as two columns to a tokenlist.
 6 | #' One column contains the ids for each hit. The other column contains the annotations.
 7 | #' Only nodes that are given a name in the tquery (using the 'label' parameter) will be added as annotation.
 8 | #' 
 9 | #' Note that while queries only find 1 node for each labeld component of a pattern (e.g., quote queries have 1 node for "source" and 1 node for "quote"), 
10 | #' all children of these nodes can be annotated by settting fill to TRUE. If a child has multiple ancestors, only the most direct ancestors are used (see documentation for the fill argument).
11 | #' 
12 | #' @param tokens      A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.
13 | #' @param column      The name of the column in which the annotations are added. The unique ids are added as column_id
14 | #' @param ...         One or multiple tqueries, or a list of queries, as created with \link{tquery}. Queries can be given a named by using a named argument, which will be used in the annotation_id to keep track of which query was used. 
15 | #' @param block       Optionally, specify ids (doc_id - sentence - token_id triples) that are blocked from querying and filling (ignoring the id and recursive searches through the id). 
16 | #' @param fill        Logical. If TRUE (default) also assign the fill nodes (as specified in the tquery). Otherwise these are ignored 
17 | #' @param overwrite   If TRUE, existing column will be overwritten. Otherwise (default), the exsting annotations in the column will be blocked, and new annotations will be added. This is identical to using multiple queries.
18 | #' @param block_fill  If TRUE (and overwrite is FALSE), the existing fill nodes will also be blocked. In other words, the new annotations will only be added if the 
19 | #' @param copy        If TRUE (default), the data.table is copied. Otherwise, it is changed by reference. Changing by reference is faster and more memory efficient, but is not predictable R style, so is optional. 
20 | #' @param verbose     If TRUE, report progress (only usefull if multiple queries are given)
21 | #' 
22 | #' @export
23 | #' @return The tokenIndex with the annotation columns
24 | #' @examples
25 | #' ## spacy tokens for: Mary loves John, and Mary was loved by John
26 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
27 | #' 
28 | #' ## two simple example tqueries
29 | #' passive = tquery(pos = "VERB*", label = "predicate",
30 | #'                  children(relation = c("agent"), label = "subject"))
31 | #' active =  tquery(pos = "VERB*", label = "predicate",
32 | #'                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
33 | #' 
34 | #' \donttest{ 
35 | #' tokens = annotate_tqueries(tokens, "clause", pas=passive, act=active)
36 | #' tokens
37 | #' if (interactive()) plot_tree(tokens, annotation='clause')
38 | #' }
39 | annotate <- function(tokens, column, ..., block=NULL, fill=TRUE, overwrite=FALSE, block_fill=FALSE, copy=TRUE, verbose=FALSE) {
40 |   .Deprecated('annotate_tqueries')
41 |   annotate_tqueries(tokens=tokens, column=column, ..., block=block, fill=fill, overwrite=overwrite, block_fill=block_fill, copy=copy, verbose=verbose)
42 | }


--------------------------------------------------------------------------------
/man/annotate_tqueries.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/annotate.r
 3 | \name{annotate_tqueries}
 4 | \alias{annotate_tqueries}
 5 | \title{Annotate a tokenlist based on rsyntax queries}
 6 | \usage{
 7 | annotate_tqueries(
 8 |   tokens,
 9 |   column,
10 |   ...,
11 |   block = NULL,
12 |   fill = TRUE,
13 |   overwrite = NA,
14 |   block_fill = FALSE,
15 |   copy = TRUE,
16 |   verbose = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.}
21 | 
22 | \item{column}{The name of the column in which the annotations are added. The unique ids are added as column_id}
23 | 
24 | \item{...}{One or multiple tqueries, or a list of queries, as created with \link{tquery}. Queries can be given a named by using a named argument, which will be used in the annotation_id to keep track of which query was used.}
25 | 
26 | \item{block}{Optionally, specify ids (doc_id - sentence - token_id triples) that are blocked from querying and filling (ignoring the id and recursive searches through the id).}
27 | 
28 | \item{fill}{Logical. If TRUE (default) also assign the fill nodes (as specified in the tquery). Otherwise these are ignored}
29 | 
30 | \item{overwrite}{Applies if column already exists. If TRUE, existing column will be overwritten. If FALSE, the existing annotations in the column will be blocked, and new annotations will be added. This is identical to using multiple queries.}
31 | 
32 | \item{block_fill}{If TRUE (and overwrite is FALSE), the existing fill nodes will also be blocked. In other words, the new annotations will only be added if the}
33 | 
34 | \item{copy}{If TRUE (default), the data.table is copied. Otherwise, it is changed by reference. Changing by reference is faster and more memory efficient, but is not predictable R style, so is optional.}
35 | 
36 | \item{verbose}{If TRUE, report progress (only usefull if multiple queries are given)}
37 | }
38 | \value{
39 | The tokenIndex data.table with the annotation columns added
40 | }
41 | \description{
42 | Apply queries to extract syntax patterns, and add the results as three columns to a tokenlist.
43 | The first column contains the ids for each hit. The second column contains the annotation label. The third column contains the fill level (which you probably won't use, but is important for some functionalities).
44 | Only nodes that are given a name in the tquery (using the 'label' parameter) will be added as annotation.
45 | 
46 | Note that while queries only find 1 node for each labeld component of a pattern (e.g., quote queries have 1 node for "source" and 1 node for "quote"), 
47 | all children of these nodes can be annotated by settting fill to TRUE. If a child has multiple ancestors, only the most direct ancestors are used (see documentation for the fill argument).
48 | }
49 | \examples{
50 | ## spacy tokens for: Mary loves John, and Mary was loved by John
51 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
52 | 
53 | ## two simple example tqueries
54 | passive = tquery(pos = "VERB*", label = "predicate",
55 |                  children(relation = c("agent"), label = "subject"))
56 | active =  tquery(pos = "VERB*", label = "predicate",
57 |                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
58 | 
59 | tokens = annotate_tqueries(tokens, "clause", pas=passive, act=active)
60 | tokens
61 | \donttest{ 
62 | if (interactive()) plot_tree(tokens, annotation='clause')
63 | }
64 | }
65 | 


--------------------------------------------------------------------------------
/tests/testthat/test_find_nodes.r:
--------------------------------------------------------------------------------
 1 | context("find nodes")
 2 | 
 3 | test_that("find_nodes works", {
 4 |   library(testthat)
 5 |   tokens = as_tokenindex(tokens_dutchquotes)
 6 |   
 7 |   #dat = find_nodes(tokens, label='id', lemma = 'dat')
 8 |   dat = find_nodes(tokens, 
 9 |                     tquery(label='id',lemma='dat', fill=FALSE))
10 |   
11 |   
12 |   expect_equal(nrow(dat), 1)
13 |   expect_equal(dat$token_id, 45)
14 |   
15 |   vcs = find_nodes(tokens, 
16 |                     tquery(label='parent', relation='vc', fill=FALSE))
17 |   expect_equal(nrow(vcs), 3)
18 |   expect_equal(sort(vcs$token_id), c(7,45,54))
19 | 
20 |   body = find_nodes(tokens, 
21 |                      tquery(label='id', relation="vc", fill=FALSE,
22 |                             children(label='id', relation='body', fill=FALSE)))
23 |   expect_equal(nrow(body), 2)
24 | 
25 |   # can we get_children with children?
26 |   children = find_nodes(tokens, 
27 |                          tquery(children(label='child', relation="body", fill=FALSE,
28 |                                          children(label='grandchild', relation='vc', fill=FALSE))))
29 |   expect_equal(nrow(children), 2)
30 |   
31 |   nodes = find_nodes(tokens, 
32 |                       tquery(label='test', relation='su', fill=FALSE,
33 |                              children(label='child', fill=FALSE)))
34 | 
35 |   # get parents
36 |   parents = find_nodes(tokens, 
37 |                         tquery(relation="vc", parents(label='parent', POS = 'verb', fill=FALSE)))
38 | 
39 |   
40 |   expect_equal(nrow(parents), 3)
41 |   expect_equal(parents$token_id, c(6,44,53))
42 |   
43 |   # get parents, grandparents, children and grandchildren
44 |   family = find_nodes(tokens, 
45 |                        tquery(relation='vc', fill=FALSE,
46 |                               parents(label='parent', fill=FALSE,
47 |                                       parents(label='grandparent', fill=FALSE)),
48 |                               children(label='child', relation='obj1', fill=FALSE,
49 |                                        children(label='grandchild', relation='mod', fill=FALSE))))
50 |   expect_equal(nrow(family), 4)
51 |   expect_equal(family$token_id, c(53,45,51,50))
52 |   
53 |   # test using req for optional arguments
54 |   test_req = tokens_corenlp
55 |   nodes1 = find_nodes(test_req, 
56 |                        tquery(POS = 'VB*', label='verb',
57 |                               children(relation = 'nsubj', label='subject'),
58 |                               children(relation = 'dobj', label='object', req=FALSE)))
59 |   nodes = find_nodes(test_req, 
60 |                        tquery(POS = 'VB*', label='verb',
61 |                               children(relation = 'nsubj', label='subject'),
62 |                               children(relation = 'dobj', label='object', req=TRUE)))
63 |   nodes3 = find_nodes(test_req, 
64 |                        tquery(POS = 'VB*', label='verb',
65 |                               children(relation = 'nsubj', label='subject')))
66 |   expect_equal(unique(nodes1$.ID), unique(nodes3$.ID))
67 |   expect_true(length(unique(nodes$.ID))< length(unique(nodes1$.ID)))
68 |   
69 |   
70 |   # test using regex (__R) and perl regex (__P)
71 |   wildcards = find_nodes(test_req, tquery(POS = 'VB*', label='verb'))
72 |   regex = find_nodes(test_req, tquery(POS__R = 'VB.*', label='verb'))
73 |   perl = find_nodes(test_req, tquery(POS__P = 'VB.*', label='verb'))
74 |   expect_equal(wildcards, regex)
75 |   expect_equal(regex, perl)
76 | })
77 | 


--------------------------------------------------------------------------------
/R/syntax_rules.r:
--------------------------------------------------------------------------------
 1 | #' Apply queries created with \link{tquery}
 2 | #'
 3 | #' @param tokens  A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.
 4 | #' @param ...      tqueries, as created with \link{tquery}. Can also be a list with tquery functions. It is recommended to use named arguments/lists, to name the tqueries. 
 5 | #' @param as_chain If TRUE, Nodes that have already been assigned assigned earlier in the chain will be ignored (see 'block' argument). 
 6 | #' @param block    Optionally, specify ids (doc_id - sentence - token_id triples) where find_nodes will stop (ignoring the id and recursive searches through the id). 
 7 | #'                 Can also be a data.table returned by (a previous) apply_queries, in which case all ids are blocked. 
 8 | #' @param check    If TRUE, return a warning if nodes occur in multiple patterns, which could indicate that the find_nodes query is not specific enough.
 9 | #' @param fill     If TRUE (default) the fill nodes are added. Otherwise these are ignored, even if the queries include fill()
10 | #' @param return_wide If TRUE, return nodes in wide format.
11 | #' @param verbose  If TRUE, report progress (only useful if multiple queries are used)
12 | #'
13 | #' @export
14 | #' @return        A data.table in which each row is a node for which all conditions are satisfied, and each column is one of the linked nodes 
15 | #'                (parents / children) with names as specified in the label argument.
16 | #'                
17 | #' @examples 
18 | #' ## spacy tokens for: Mary loves John, and Mary was loved by John
19 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
20 | #' 
21 | #' ## two simple example tqueries
22 | #' passive = tquery(pos = "VERB*", label = "predicate",
23 | #'                  children(relation = c("agent"), label = "subject"))
24 | #' active =  tquery(pos = "VERB*", label = "predicate",
25 | #'                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
26 | #'
27 | #' nodes = apply_queries(tokens, pas=passive, act=active)
28 | #' nodes
29 | apply_queries <- function(tokens, ..., as_chain=FALSE, block=NULL, check=FALSE, fill=TRUE, return_wide=FALSE, verbose=FALSE) {
30 |   if (rsyntax_threads() != data.table::getDTthreads()) {
31 |     old_threads = data.table::getDTthreads()
32 |     on.exit(data.table::setDTthreads(old_threads))
33 |     data.table::setDTthreads(rsyntax_threads())
34 |   }
35 |   
36 |   tokens = as_tokenindex(tokens)
37 |   r = list(...)
38 |   
39 |   is_tquery = sapply(r, methods::is, 'tQuery')
40 |   r = c(r[is_tquery], unlist(r[!is_tquery], recursive = FALSE))
41 |   
42 |   out = vector('list', length(r))
43 |   
44 |   if (verbose) message('Applying queries:')
45 |   for (i in 1:length(r)){
46 |     if (!methods::is(r[[i]], 'tQuery')) next
47 |     .TQUERY_NAME = names(r)[i]
48 |     if (verbose) cat(paste0('\t', .TQUERY_NAME, '\n'))
49 |     if (is.null(.TQUERY_NAME)) .TQUERY_NAME = ''
50 |     if (grepl(',', .TQUERY_NAME)) stop('tquery name cannot contain a comma')
51 |     .TQUERY_NAME = ifelse(.TQUERY_NAME == '', NA, as.character(.TQUERY_NAME))
52 | 
53 |     nodes = find_nodes(tokens, r[[i]], block=block, name=.TQUERY_NAME, fill=FALSE, melt = FALSE)
54 |     
55 |     if (!is.null(nodes)) {
56 |       if (as_chain) block = get_long_ids(block, nodes)
57 |       out[[i]] = nodes  
58 |     }
59 |   }
60 |   
61 |   
62 |   if (fill && verbose) message('Adding fill nodes:')
63 |   for (i in 1:length(r)) {
64 |     if (is.null(out[[i]])) next
65 |     if (fill) {
66 |       if (verbose) cat(paste0('\t', names(r)[i], '\n'))
67 |       out[[i]] = add_fill(tokens, out[[i]], r[[i]], block=block)
68 |     }
69 |     out[[i]] = if (return_wide) out[[i]] else melt_nodes_list(out[[i]])
70 |   }
71 |   
72 |   nodes = data.table::rbindlist(out, fill=TRUE)
73 |   #if (chain && !chain_fill && '.FILL_LEVEL' %in% colnames(d)) {
74 |   #  data.table::setorder(d, '.FILL_LEVEL') 
75 |   #  d = 
76 |   #}
77 |   class(nodes) = if (return_wide) c('rsyntaxNodesWide', class(nodes)) else c('rsyntaxNodes', class(nodes))
78 |   nodes
79 | } 
80 | 


--------------------------------------------------------------------------------
/R/melt_nodes.r:
--------------------------------------------------------------------------------
 1 | melt_nodes_list <- function(nodes, fill_only_first=TRUE) {
 2 |   .ROLE = NULL; token_id = NULL; .FILL_LEVEL = NULL
 3 |   ## the nodes created in find_nodes have a weird structure, which is only useful for internal use
 4 |   ## here we melt the nodes to a more convenient format
 5 |   
 6 |   for (.RM in grep('^BLOCK', colnames(nodes), value=TRUE)) 
 7 |     nodes[, (.RM) := NULL]
 8 |   
 9 |   if (nrow(nodes) == 0 || ncol(nodes) <= 3) return(data.table::data.table())
10 |   nodes_list = list()
11 |   fill_cols = grep('_FILL', colnames(nodes), fixed=TRUE, value=TRUE)
12 |   
13 |   cols = setdiff(colnames(nodes), c('doc_id','sentence','.ID'))
14 |   cols = setdiff(cols, grep('\\_LEVEL$', cols, value=TRUE))
15 |   level = paste0(cols, '_LEVEL')
16 |   
17 |   missing = setdiff(c(level), colnames(nodes))
18 |   nodes[, (missing) := double()]
19 |   
20 |   nodes = melt(nodes, id.vars=c('doc_id','sentence','.ID'), 
21 |                       measure.vars=list(cols,level), 
22 |                       value.name=c('token_id','.FILL_LEVEL'),
23 |                       variable.name='.ROLE', variable.factor=TRUE)
24 |   nodes[, .ROLE := factor(.ROLE, labels=cols)]
25 |   nodes = subset(nodes, !is.na(token_id))
26 |   
27 |   ## remove duplicate label name tags (#2, etc)
28 |   data.table::setattr(nodes$.ROLE, 'levels', gsub('#.*', '', levels(nodes$.ROLE)))
29 |   
30 |   ## fill should be 0 if not fill (NA), and _FILL should be removed from .ROLE 
31 |   nodes[is.na(nodes$.FILL_LEVEL), .FILL_LEVEL := 0]
32 |   data.table::setattr(nodes$.ROLE, 'levels', gsub('\\_FILL', '', levels(nodes$.ROLE)))
33 | 
34 |   if (fill_only_first) {
35 |     data.table::setorder(nodes, '.FILL_LEVEL')
36 |     nodes = unique(nodes, by=c('doc_id','sentence','token_id'))
37 |   } else {
38 |     nodes = unique(nodes, by=c('doc_id','sentence','token_id','.ROLE','.ID'))
39 |   }
40 |   nodes
41 | }
42 | 
43 | #' Get ids in various forms to extract token_ids
44 | #'
45 | #' @param ... Either a data.table with the columns doc_id, sentence and token_id, or the output of \link{apply_queries}
46 | #' @param select    If not null, a character vector for selecting column names
47 | #' @param with_fill If TRUE, include the ids of the fill nodes
48 | #'
49 | #' @return A data.table with the columns doc_id, sentence and token_id
50 | #' @export
51 | get_long_ids <- function(..., select=NULL, with_fill=FALSE) {
52 |   l = list(...)
53 |   
54 |   len = length(l)
55 |   out = vector('list', len)
56 |   for (i in 1:len) {
57 |     d = l[[i]]
58 |     if (is.null(d)) next
59 |     if (methods::is(d, 'data.table')) {
60 |       if (!'token_id' %in% colnames(d)) {
61 |         if (!is.null(select)) {
62 |           select = setdiff(select, '.TQUERY')
63 |           d = subset(d, select = colnames(d) %in% union(c('doc_id', 'sentence'), select))
64 |         } else {
65 |           d = subset(d, select = colnames(d) %in% setdiff(colnames(d), '.TQUERY'))
66 |         }
67 |         
68 |         if (with_fill) {
69 |           ignore_vars = grep('\\FILL\\_LEVEL', colnames(d), value=TRUE)  ## exclude only level vars from measure vars
70 |         } else {
71 |           ignore_vars = grep('\\_FILL', colnames(d), value=TRUE)   ## also exclude fill vars from measure vars
72 |         }
73 |         ignore_vars = union(ignore_vars, grep('\\_PARENT', colnames(d), value=TRUE))
74 | 
75 |         d = melt(d, id.vars = c('doc_id', 'sentence'), 
76 |                     measure.vars=setdiff(colnames(d), c('doc_id','sentence','.ID',ignore_vars)),
77 |                     variable.name = '.VARIABLE', value.name='token_id')
78 |       } else {
79 |         if (!with_fill && '.FILL_LEVEL' %in% colnames(d)) 
80 |           d = d[d$.FILL_LEVEL == 0,,drop=FALSE]
81 |       } 
82 |       if (!'token_id' %in% colnames(d)) next
83 |       out[[i]] = d[,c('doc_id','sentence','token_id')]
84 |       next
85 |     }
86 |     if (methods::is(d, 'list')) {
87 |       out[[i]] = get_long_ids(d)
88 |     }
89 |     stop('Not a valid input for get_long_ids')
90 |   }
91 |   out = unique(data.table::rbindlist(out))
92 |   if (ncol(out) == 0) NULL else out
93 | }


--------------------------------------------------------------------------------
/R/print_functions.r:
--------------------------------------------------------------------------------
  1 | pipe_tr <- function() intToUtf8(9492)
  2 | pipe_tb <- function() intToUtf8(9474)
  3 | pipe_trb <- function() intToUtf8(9500)
  4 | 
  5 | abbrev_str <- function(string, maxlen) {
  6 |   if (nchar(string) > maxlen) string = paste(stringi::stri_sub(string, 0, maxlen-3), '...', sep='')
  7 |   string
  8 | }
  9 | 
 10 | recprint <- function(x, pd, level=1, connector=pipe_tr(), pipe_level=c(), max_char=getOption('tQuery_print_max_char', default=30), ...) {
 11 |   #cat(level, ': ', sep='')
 12 |   if (level > 0) {
 13 |     type = if('level' %in% names(x)) ifelse(x$level == 'children', ' c', ' p') else '  n'
 14 |     
 15 |     level_space = rep('  ', level-1)
 16 |     level_space[pipe_level] = sprintf('%s ', pipe_tb())
 17 |     text = paste(paste(level_space, collapse=''), connector, type, ' ', sep='')
 18 |     if (nchar(text) < pd[1]) text = paste(text, paste(rep(' ', pd[1] - nchar(text)), collapse=''), sep='')
 19 |     cat(text, sep='')
 20 |     
 21 |     if (!is.na(x$label))
 22 |       cat('  ', x$label, rep(' ', pd[2] - nchar(x$label)), '  ', sep='') else cat('  ', rep(' ', pd[2]), '  ', sep='')
 23 | 
 24 |     first = TRUE    
 25 |     if ('NOT' %in% names(x) && x$NOT) {
 26 |       if (!first) cat(', ') else cat(' ')
 27 |       cat('NOT=T')
 28 |       first = FALSE
 29 |     }
 30 |     if ('req' %in% names(x) && !x$req) {
 31 |       if (!first) cat(', ') else cat(' ')
 32 |       cat('req=F')
 33 |       first = FALSE
 34 |     }  
 35 |     #if (!x$select == 'NULL') {
 36 |     #  if (!first) cat(', ') else cat(' ')
 37 |     #  cat('select=', abbrev_str(x$select, max_char))
 38 |     #  first = FALSE
 39 |     #}  
 40 |     if ('depth' %in% names(x) && x$depth > 1) {
 41 |       if (!first) cat(', ') else cat(' ')
 42 |       cat('depth=', x$depth, sep='')
 43 |       first = FALSE
 44 |     }  
 45 |     
 46 |     l = x$lookup
 47 |     rec_lookup_print(l, first, max_char)
 48 |     cat('\n', sep='')
 49 |   }
 50 |   for (i in seq_along(x$nested)) {
 51 |     pipe_level = if (level < 2) pipe_level else c(pipe_level, level)
 52 |     if (i == length(x$nested))
 53 |       recprint(x$nested[[i]], pd, level+1, pipe_tr(), pipe_level=pipe_level, max_char=max_char)
 54 |     else
 55 |       recprint(x$nested[[i]], pd, level+1, pipe_trb(), pipe_level=pipe_level, max_char=max_char)
 56 |   }
 57 | }
 58 | 
 59 | rec_lookup_print <- function(l, first, max_char, op = 'AND', level=1) {
 60 |   if (op %in% c('OR','NOT')) {
 61 |     if (level > 1) cat(', ') else cat(' ')
 62 |     cat(op, '(', sep='')
 63 |   }
 64 |   for (i in seq_along(l)) {
 65 |     if (is.null(l[[i]])) next
 66 |     if (methods::is(l[[i]], 'tokenLookup')) {
 67 |       rec_lookup_print(l[[i]]$lookup, first=TRUE, max_char, l[[i]]$boolean, level=level+1)
 68 |       if (!first) cat(', ') else cat(' ')
 69 |       first = FALSE
 70 |       next
 71 |     }
 72 |     n = names(l)[[i]]
 73 |     v = if (class(l[[n]]) %in% c('factor','character')) paste0(l[[n]]) else l[[n]]
 74 |     if (length(v) > 1) v = paste0('(', abbrev_str(paste(v, collapse=','), max_char), ')')
 75 |     if (!first) cat(', ') else if (level==1) cat(' ')
 76 |     first = FALSE
 77 |     cat(n, '=', v, sep='')
 78 |   }
 79 |   if (op %in% c('OR','NOT')) cat(')')
 80 | }
 81 | 
 82 | get_print_data <- function(x, d=c(0,0)) {
 83 |   d[2] = max(d[2], nchar(x$label), na.rm = TRUE)
 84 |   for (i in seq_along(x$nested)) {
 85 |     d = get_print_data(x$nested[[i]], c(d[1]+1, d[2]))
 86 |   }
 87 |   d
 88 | }
 89 | 
 90 | #' S3 print for tQuery class
 91 | #'
 92 | #' @param x a tQuery
 93 | #' @param ... not used
 94 | #'
 95 | #' @method print tQuery
 96 | #' @examples
 97 | #' q = tquery(label='quote',
 98 | #'            children(relation='nmod:according_to', label='source',
 99 | #'                     children(label='verb')))
100 | #' q 
101 | #' @export
102 | print.tQuery <- function(x, ...) {
103 |   pd = get_print_data(x, c(0,10))
104 |   pd[1] = (pd[1]*3)
105 |   if (pd[1] < 12) pd[1] = 10
106 |   #cat('LEVEL ', rep(' ', pd[1]-3), '  NAME', rep(' ', pd[2]-4), '   FILTER\n', sep='')
107 |   #if (!is.na(x$label) &! x$label == '') cat(x$label, '\n', sep = '') else cat('...', sep='')
108 |   recprint(x, pd, connector='', ...)
109 | }
110 | 


--------------------------------------------------------------------------------
/man/custom_fill.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{custom_fill}
 4 | \alias{custom_fill}
 5 | \title{Specify custom fill behavior}
 6 | \usage{
 7 | custom_fill(
 8 |   ...,
 9 |   g_id = NULL,
10 |   depth = Inf,
11 |   connected = FALSE,
12 |   max_window = c(Inf, Inf),
13 |   min_window = c(0, 0)
14 | )
15 | }
16 | \arguments{
17 | \item{...}{Accepts two types of arguments: name-value pairs for finding nodes (i.e. rows), and functions to look for parents/children of these nodes.
18 | 
19 | The name in the name-value pairs need to match a column in the data.table, and the value needs to be a vector of the same data type as the column.
20 | By default, search uses case sensitive matching, with the option of using common wildcards (* for any number of characters, and ? for a single character).
21 | Alternatively, flags can be used to to change this behavior to 'fixed' (__F), 'igoring case' (__I) or 'regex' (__R). See details for more information. 
22 | 
23 | If multiple name-value pairs are given, they are considered as AND statements, but see details for syntax on using OR statements, and combinations.
24 | 
25 | To look for parents and children of the nodes that are found, you can use the \link{parents} and \link{children} functions as (named or unnamed) arguments. 
26 | These functions have the same query arguments as tquery, but with some additional arguments.}
27 | 
28 | \item{g_id}{Find nodes by global id, which is the combination of the doc_id, sentence and token_id. Passed as a data.frame or data.table with 3 columns: (1) doc_id, (2) sentence and (3) token_id.}
29 | 
30 | \item{depth}{A positive integer, determining how deep parents/children are sought. 1 
31 | means that only direct parents and children of the node are retrieved. 2 means children and grandchildren, etc.
32 | All parents/children must meet the filtering conditions (... or g_id)}
33 | 
34 | \item{connected}{Controls behavior if depth > 1 and filters are used. If FALSE, all parents/children to the given depth are retrieved, and then filtered. 
35 | This way, grandchildren that satisfy the filter conditions are retrieved even if their parents do not satisfy the conditions.
36 | If TRUE, the filter is applied at each level of depth, so that only fully connected branches of nodes that satisfy the conditions are retrieved.}
37 | 
38 | \item{max_window}{Set the max token distance of the children/parents to the node. Has to be either a numerical vector of length 1 for distance in both directions, or a 
39 | vector of length 2, where the first value is the max distance to the left, and the second value the max distance to the right. Default is c(Inf, Inf) meaning that no max distance is used.}
40 | 
41 | \item{min_window}{Like max_window, but for the min distance. Default is c(0,0) meaning that no min is used.}
42 | }
43 | \value{
44 | Should not be used outside of \link{tquery}
45 | }
46 | \description{
47 | If a tquery(), parents() or children() function has set a label, all children of the matched node (that are not matched by another query) will also be given this label.
48 | This is called the 'fill' heuristic.
49 | The custom_fill() function can be used to give more specific conditions for which children need to be labeled.
50 | 
51 | The function can be used almost identically to the children() function. The specification of the look-up conditions works in the same way.
52 | NOTE that custom_fill, just like the children() function, should be passed as an unnamed argument, and NOT to the 'fill' argument 
53 | (which is the boolean argument for whether fill should be used)
54 | 
55 | For the custom_fill function, the special BREAK() look-up function is particularly powerful.
56 | custom_fill will recursively search for children, children of children, etc.
57 | The look-up conditions in custom_fill determine which of all these direct and indirect children to label.
58 | Often, however, you would want to the recursive loop to 'break' when certain conditions are met.
59 | For instance, to ignore children in a relative clause: custom_fill(BREAK(relation = 'relcl'))
60 | }
61 | \examples{
62 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
63 | 
64 | ## custom fill rule that ignores relative clauses
65 | no_relcl_fill = custom_fill(BREAK(relation='relcl'))
66 | 
67 | ## add custom fill as argument in children(). NOTE that it should be
68 | ## passed as an unnamed argument (and not to the fill boolean argument)
69 | tq = tquery(label = 'verb', pos='VERB', fill=FALSE,
70 |          children(label = 'subject', relation = 'nsubj', no_relcl_fill),
71 |          children(label = 'object', relation = 'dobj', no_relcl_fill))
72 |          
73 | tokens = annotate_tqueries(tokens, "clause", tq)
74 | tokens
75 | }
76 | 


--------------------------------------------------------------------------------
/R/filter_tokens.r:
--------------------------------------------------------------------------------
  1 | 
  2 | filter_tokens <- function(tokens, lookup=list(), .G_ID=NULL, .G_PARENT=NULL, .BLOCK=NULL, use_index=TRUE) {
  3 |   ## we need the ridiculous .UPPERCASE because if the name happens to be a column in data.table it messes up (it will use its own column for the binary search)
  4 |   .G_ID = unique(.G_ID)
  5 |   .G_PARENT = unique(.G_PARENT)
  6 |   
  7 |   i = NULL
  8 |   null_intersect <- function(x, y) if (is.null(x)) y else intersect(x,y) 
  9 |   if (!is.null(.G_ID)) i = null_intersect(i, tokens[list(.G_ID[[1]], .G_ID[[2]], .G_ID[[3]]), on=c('doc_id','sentence','token_id'), which=TRUE])
 10 |   if (!is.null(.G_PARENT)) i = null_intersect(i, tokens[list(.G_PARENT[[1]], .G_PARENT[[2]], .G_PARENT[[3]]), on=c('doc_id','sentence','parent'), which=TRUE])
 11 |   .BLOCK = get_long_ids(.BLOCK)
 12 |   if (!is.null(.BLOCK)) i = null_intersect(i, tokens[!list(.BLOCK[[1]], .BLOCK[[2]], .BLOCK[[3]]), on=c('doc_id','sentence','token_id'), which=TRUE])
 13 |   if (!is.null(i)) {
 14 |     i = stats::na.omit(i)
 15 |     tokens = tokens[as.numeric(i),]  
 16 |   }
 17 |   
 18 |   i = lookup_tokens(tokens, lookup, use_index=use_index)
 19 |   if (!is.null(i)) {
 20 |     i = stats::na.omit(i)
 21 |     tokens = tokens[as.numeric(i),]  
 22 |   }
 23 |   
 24 |   #if (!select == 'NULL' & !is.null(select)) tokens = tokens[eval(parse(text=select), tokens, e),]
 25 |   tokens
 26 | }
 27 | 
 28 | lookup_tokens <- function(tokens, lookup=list(), boolean='AND', use_index=TRUE) {
 29 |   i = NULL
 30 |   for (lookup_i in seq_along(lookup)) {
 31 |     .N = names(lookup)[lookup_i]
 32 |     .V = lookup[[lookup_i]]
 33 |     if (is.null(.V)) next
 34 |     
 35 |     if (methods::is(.V, 'tokenLookup') | methods::is(.V, 'tokenLookupBreak')) {
 36 |       result = lookup_tokens(tokens, .V$lookup, boolean=.V$boolean)
 37 |       if (is.null(result)) next
 38 |     } else {
 39 |       .COLNAME = gsub('__.*', '', .N)
 40 |       if (!.COLNAME %in% colnames(tokens)) stop(sprintf('%s is not a valid column name in tokens', .N))
 41 |       if (use_index) if (!.N %in% data.table::indices(tokens)) data.table::setindexv(tokens, .COLNAME)
 42 | 
 43 |       .V = prepare_terms(.V, tokens[[.COLNAME]], 
 44 |                          ignore_case = grepl('__N?R?F?P?I', .N), 
 45 |                          regex = grepl('__N?I?F?P?R', .N),
 46 |                          fixed = grepl('__N?R?I?P?F', .N),
 47 |                          perl = grepl('__N?I?F?P?P', .N))
 48 |       result = tokens[list(.V), on=(.COLNAME), which=TRUE, nomatch=0, allow.cartesian=TRUE]
 49 |     }
 50 |     if (is.null(i)) {
 51 |       if (boolean == 'NOT') result = if (length(result) > 0) (1:nrow(tokens))[-result] else 1:nrow(tokens)
 52 |       i = result
 53 |     } else {
 54 |       if (boolean == 'NOT') i = setdiff(i, result)
 55 |       if (boolean == 'AND') i = intersect(i, result)
 56 |       if (boolean == 'OR') i = union(i, result)
 57 |     }
 58 |   }
 59 |   i
 60 | }
 61 | 
 62 | get_full_terms <- function(x, terms, batchsize=25, ignore_case=TRUE, perl=F) {
 63 |   terms = if (methods::is(terms, 'factor')) levels(terms) else unique(terms)
 64 |   if (length(x) > 1) { ## if there are multiple terms, make batches of terms and turn each batch into a single regex
 65 |     x = split(as.character(x), ceiling(seq_along(x)/batchsize))
 66 |     x = sapply(x, stringi::stri_paste, collapse='|')
 67 |     out = rep(FALSE, length(x))
 68 |     for(xbatch in x){
 69 |       out = out | grepl(xbatch, terms, ignore.case=ignore_case, perl = perl)
 70 |     }
 71 |   } else {
 72 |     out = grepl(as.character(x), terms, ignore.case=ignore_case, perl = perl)
 73 |   }
 74 |    
 75 |   terms[out]
 76 | }
 77 | 
 78 | search_term_regex <- function(patterns) {
 79 |   patterns = gsub("([^0-9a-zA-Z])", '\\\\\\1', x=patterns)  # escape special characters
 80 |   patterns = gsub('\\\\(\\*)|\\\\(\\?)', '.\\1', patterns)  # process wildcards
 81 |   paste0('\\b',patterns,'\\b')                              # set word boundaries
 82 | }
 83 | 
 84 | prepare_terms <- function(x, terms, ignore_case=TRUE, regex=FALSE, fixed=FALSE, perl_regex=FALSE) {
 85 |   if (ignore_case && fixed) warning('ignore_case (__I) is not used, because fixed (__F) is also used')
 86 |   if (perl_regex) {
 87 |     perl = TRUE
 88 |     regex = TRUE
 89 |   } else {
 90 |     perl = FALSE
 91 |   }
 92 |   if (regex && fixed) warning('regex (__R) is not used, because fixed (__F) is also used')
 93 |   if (fixed) return(x)
 94 |   
 95 |   if (regex) {
 96 |     return(get_full_terms(x, terms, ignore_case = ignore_case, perl=perl)) 
 97 |   } else {
 98 |     if (ignore_case) {
 99 |       x = search_term_regex(x)
100 |       return(get_full_terms(x, terms, ignore_case = TRUE)) 
101 |     } else {
102 |       has_wildcard = grepl('[*?]', x)
103 |       if (!any(has_wildcard)) return(x)
104 |       x_full = get_full_terms(search_term_regex(x[has_wildcard]), terms, ignore_case=FALSE)
105 |       x = c(x[!has_wildcard], x_full)
106 |       return(unique(x))
107 |     }
108 |   }
109 | }
110 | 
111 | 


--------------------------------------------------------------------------------
/man/plot_tree.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/plot_tree.r
  3 | \name{plot_tree}
  4 | \alias{plot_tree}
  5 | \title{Create an igraph tree from a sentence}
  6 | \usage{
  7 | plot_tree(
  8 |   tokens,
  9 |   ...,
 10 |   sentence_i = 1,
 11 |   doc_id = NULL,
 12 |   sentence = NULL,
 13 |   annotation = NULL,
 14 |   only_annotation = FALSE,
 15 |   pdf_file = NULL,
 16 |   allign_text = TRUE,
 17 |   ignore_rel = NULL,
 18 |   all_lower = FALSE,
 19 |   all_abbrev = NULL,
 20 |   textsize = 1,
 21 |   spacing = 1,
 22 |   use_color = TRUE,
 23 |   max_curve = 0.3,
 24 |   palette = grDevices::terrain.colors,
 25 |   rel_on_edge = F,
 26 |   pdf_viewer = FALSE,
 27 |   viewer_mode = TRUE,
 28 |   viewer_size = c(100, 100)
 29 | )
 30 | }
 31 | \arguments{
 32 | \item{tokens}{A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}. Can also be a corpustools tCorpus.}
 33 | 
 34 | \item{...}{Optionally, select which columns to include as labels and how to present them. Can be quoted or unquoted names and expressions, using columns in the tokenIndex. For example, plot_tree(tokens, token, pos) will use the $token and $pos columns in tokens. You can also use expressions for easy controll of visulizations. For example: plot_tree(tokens, tolower(token), abbreviate(pos,1)). (note that abbreviate() is really usefull here)}
 35 | 
 36 | \item{sentence_i}{By default, plot_tree uses the first sentence (sentence_i = 1) in the data. sentence_i can be changed to select other sentences by position (the i-th unique sentence in the data). Note that sentence_i does not refer to the values in the sentence column (for this use the sentence argument together with doc_id)}
 37 | 
 38 | \item{doc_id}{Optionally, the document id can be specified. If so, sentence_i refers to the i-th sentence within the given document.}
 39 | 
 40 | \item{sentence}{Optionally, the sentence id can be specified (note that sentence_i refers to the position). If sentence is given, doc_id has to be given as well.}
 41 | 
 42 | \item{annotation}{Optionally, a column with an rsyntax annotation, to add boxes around the annotated nodes.}
 43 | 
 44 | \item{only_annotation}{If annotation is given, only_annotation = TRUE will print only the nodes with annotations.}
 45 | 
 46 | \item{pdf_file}{Directly save the plot as a pdf file}
 47 | 
 48 | \item{allign_text}{If TRUE (default) allign text (the columns specified in ...) in a single horizontal line at the bottom, instead of following the different levels in the tree}
 49 | 
 50 | \item{ignore_rel}{Optionally, a character vector with relation names that will not be shown in the tree}
 51 | 
 52 | \item{all_lower}{If TRUE, make all text lowercase}
 53 | 
 54 | \item{all_abbrev}{If an integer, abbreviate all text, with the number being the target number of characters.}
 55 | 
 56 | \item{textsize}{A number to manually change the textsize. The function tries to set a suitable textsize for the plotting device, but if this goes wrong and now everything is broken and sad, you can multiply the textsize with the given number.}
 57 | 
 58 | \item{spacing}{A number for scaling the distance between words (between 0 and infinity)}
 59 | 
 60 | \item{use_color}{If true, use colors}
 61 | 
 62 | \item{max_curve}{A number for controlling the allowed amount of curve in the edges.}
 63 | 
 64 | \item{palette}{A function for creating a vector of n contiguous colors. See ?terrain.colors for standard functions and documentation}
 65 | 
 66 | \item{rel_on_edge}{If TRUE, print relation label on edge instead of above the node}
 67 | 
 68 | \item{pdf_viewer}{If TRUE, view the plot as a pdf. If no pdf_file is specified, the pdf will be saved to the temp folder}
 69 | 
 70 | \item{viewer_mode}{By default, the plot is saved as a PNG embedded in a HTML and opened in the viewer. This hack makes it independent of the 
 71 | size of the plotting device and enables scrolling. By setting viewer_mode to False, the current plotting device is used.}
 72 | 
 73 | \item{viewer_size}{A vector of length 2, that multiplies the width (first value) and height (second value) of the viewer_mode PNG}
 74 | }
 75 | \value{
 76 | plots a dependency tree.
 77 | }
 78 | \description{
 79 | Create an igraph tree from a token_index (\link{as_tokenindex}) or a data.frame that can be coerced to a tokenindex.
 80 | 
 81 | By default, all columns in the data are included as labels. This can be changes by using the ... argument.
 82 | }
 83 | \examples{
 84 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
 85 | 
 86 | \donttest{
 87 | if (interactive()) plot_tree(tokens, token, pos)
 88 | 
 89 | ## plot with annotations
 90 | direct = tquery(label = 'verb', pos = 'VERB', fill=FALSE,
 91 |                 children(label = 'subject', relation = 'nsubj'),
 92 |                 children(label = 'object', relation = 'dobj'))
 93 | passive = tquery(label = 'verb', pos = 'VERB', fill=FALSE,
 94 |                  children(label = 'subject', relation = 'agent'),
 95 |                  children(label = 'object', relation = 'nsubjpass'))
 96 |  
 97 | if (interactive()) {                
 98 | tokens \%>\%
 99 |    annotate_tqueries('clause', pas=passive, dir=direct) \%>\%
100 |    plot_tree(token, pos, annotation='clause')
101 | }
102 | }
103 | }
104 | 


--------------------------------------------------------------------------------
/man/tquery.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tquery.r
 3 | \name{tquery}
 4 | \alias{tquery}
 5 | \title{Create a query for dependency based parse trees in a data.table (CoNLL-U or similar format).}
 6 | \usage{
 7 | tquery(..., g_id = NULL, label = NA, fill = TRUE, block = FALSE)
 8 | }
 9 | \arguments{
10 | \item{...}{Accepts two types of arguments: name-value pairs for finding nodes (i.e. rows), and functions to look for parents/children of these nodes.
11 | 
12 | The name in the name-value pairs need to match a column in the data.table, and the value needs to be a vector of the same data type as the column.
13 | By default, search uses case sensitive matching, with the option of using common wildcards (* for any number of characters, and ? for a single character).
14 | Alternatively, flags can be used to to change this behavior to 'fixed' (__F), 'igoring case' (__I) or 'regex' (__R). See details for more information. 
15 | 
16 | If multiple name-value pairs are given, they are considered as AND statements, but see details for syntax on using OR statements, and combinations.
17 | 
18 | To look for parents and children of the nodes that are found, you can use the \link{parents} and \link{children} functions as (named or unnamed) arguments. 
19 | These functions have the same query arguments as tquery, but with some additional arguments.}
20 | 
21 | \item{g_id}{Find nodes by global id, which is the combination of the doc_id, sentence and token_id. Passed as a data.frame or data.table with 3 columns: (1) doc_id, (2) sentence and (3) token_id.}
22 | 
23 | \item{label}{A character vector, specifying the column name under which the selected tokens are returned. 
24 | If NA, the column is not returned.}
25 | 
26 | \item{fill}{Logical. If TRUE (default), the default custom_fill() will be used. To more specifically control fill, you can nest the \link{custom_fill} 
27 | function (a special version of the children function).}
28 | 
29 | \item{block}{Logical. If TRUE, the node will be blocked from being assigned (labeled). This is mainly useful if you have a node that you do not want to be assigned by fill,
30 | but also don't want to 'label' it. Essentially, block is shorthand for using label and then removing the node afterwards. If block is TRUE, label has to be NA.}
31 | }
32 | \value{
33 | A tQuery object, that can be used with the \link{apply_queries} function.
34 | }
35 | \description{
36 | To find nodes you can use named arguments, where the names are column names (in the data.table on which the
37 | queries will be used) and the values are vectors with look-up values. 
38 | 
39 | Children or parents of nodes can be queried by passing the \link{children} or \link{parents} function as (named or unnamed) arguments.
40 | These functions use the same query format as the tquery function, and children and parents can be nested recursively to find children of children etc. 
41 | 
42 | The custom_fill() function (also see fill argument) can be nested to customize which children of a 'labeled' node need to be matched. It can only be nested in a query if the label argument is not NULL,
43 | and by default will include all children of the node that have not been assigned to another node. If two nodes have a shared child, the child will be
44 | assigned to the closest node. 
45 |   
46 | Please look at the examples below for a recommended syntactic style for using the find_nodes function and these nested functions.
47 | }
48 | \details{
49 | Multiple values in a name-value pair operate as OR conditions.
50 | For example, tquery(relation = c('nsubj','dobj')) means that the relation column should have the value 'nsubj' OR 'dobj'. 
51 | 
52 | If multiple named arguments are given they operate as AND conditions. 
53 | For example, tquery(relation = 'nsubj', pos = 'PROPN') means that the relation should be 'nsubj' AND the pos should be 'PROPN'.
54 | 
55 | This easily combines for the most common use case, which is to select on multiple conditions (relation AND pos), but allowing different (similar) values ('PROPN' OR 'NOUN').
56 | For example: tquery(relation = 'nsubj', pos = c('PROPN','NOUN')) means that the node should have the 'nsubj' relation, but pos can be either 'PROPN' or 'NOUN'.
57 | 
58 | For more specific behavior, the AND(), OR() and NOT() functions can be used for boolean style conditions.
59 | 
60 | There are several flags that can be used to change search condition. To specify flags, add a double underscore and the flag character to the name in the name value pairs (...).
61 | By adding the suffix __R, query terms are considered to be regular expressions, and the suffix __I uses case insensitive search (for normal or regex search).
62 | If the suffix __F is used, only exact matches are valid (case sensitive, and no wildcards).
63 | Multiple flags can be combined, such as lemma__RI, or lemma_IR  (order of flags is irrelevant)
64 | }
65 | \examples{
66 | ## it is convenient to first prepare vectors with relevant words/pos-tags/relations
67 | .SAY_VERBS = c("tell", "show","say", "speak") ## etc.
68 | .QUOTE_RELS=  c("ccomp", "dep", "parataxis", "dobj", "nsubjpass", "advcl")
69 | .SUBJECT_RELS = c('su', 'nsubj', 'agent', 'nmod:agent') 
70 | 
71 | quotes_direct = tquery(lemma = .SAY_VERBS,
72 |                          children(label = 'source', p_rel = .SUBJECT_RELS),
73 |                          children(label = 'quote', p_rel = .QUOTE_RELS))
74 | quotes_direct 
75 | }
76 | 


--------------------------------------------------------------------------------
/R/tokenbrowser.r:
--------------------------------------------------------------------------------
  1 | #' Create a full text browser with highlighted rsyntax annotations
  2 | #'
  3 | #' @param tokens      A tokenIndex
  4 | #' @param annotation  The name of the column that contains the rsyntax annotation
  5 | #' @param value       Optionally, a character vector with values in annotation. If used, only these values are fully colored, and the other (non NA) values only have border colors.
  6 | #' @param value2      Optionally, a character vector with values in annotation other than those specified in 'value'. If used, only these values have border colors.
  7 | #' @param meta        Optionally, a data.frame with document meta data. Has to have a column named doc_id of which the values match with the doc_id column in tokens 
  8 | #' @param token_col   The name of the column in tokens with the token text
  9 | #' @param filename    Optionally, a filename to directly save the file. If not specified, a temporary file is created
 10 | #' @param view        If TRUE, the browser will immediatly be viewed in the viewer panel
 11 | #' @param random_seed If a number is given, it is used as a seed to randomize the order of documents. This is usefull for
 12 | #'                    validations purposes, because the doc_id in the tokenindex is sorted.
 13 | #' @param ...         Arguments passed to \link[tokenbrowser]{create_browser}
 14 | #'
 15 | #' @return  The url for the file
 16 | #' @export
 17 | #'
 18 | #' @examples
 19 | #' tokens = tokens_spacy
 20 | #' 
 21 | #' ## two simple example tqueries
 22 | #' passive = tquery(pos = "VERB*", label = "predicate",
 23 | #'                 children(relation = c("agent"), label = "subject"))
 24 | #' active =  tquery(pos = "VERB*", label = "predicate",
 25 | #'                 children(relation = c("nsubj", "nsubjpass"), label = "subject"))
 26 | #' 
 27 | #' 
 28 | #' \donttest{
 29 | #' tokens = annotate_tqueries(tokens, 'clause', pas=passive, act=active)
 30 | #' syntax_reader(tokens, annotation = 'clause', value = 'subject')
 31 | #' }
 32 | syntax_reader <- function(tokens, annotation, value=NULL, value2=NULL, meta=NULL, token_col='token', filename=NULL, view=TRUE, random_seed=NA, ...){
 33 |   #if (!methods::is(tokens, 'tokenIndex')) stop('tokens has to be a tokenIndex')
 34 |   if (!is.na(random_seed)) {
 35 |     tokens = data.table::copy(tokens)
 36 |     doc_ids = unique(tokens$doc_id)
 37 |     set.seed(random_seed)
 38 |     doc_ids = doc_ids[sample(1:length(doc_ids))]
 39 |     tokens = tokens[doc_ids, on=c('doc_id')]
 40 |   }
 41 |   
 42 |   #tokens = as_tokenindex(tokens)
 43 | 
 44 |   ann_id = paste0(annotation, '_id')
 45 |   id = match(tokens[[ann_id]], unique(tokens[[ann_id]]))
 46 |   
 47 |   if (is.null(value)) {
 48 |     if (!is.null(value2)) stop("value2 can only be used (not-NULL) if value is used")
 49 |     value = unique(as.character(tokens[[annotation]]))
 50 |   }
 51 |   value = ifelse(!is.na(tokens[[annotation]]) & tokens[[annotation]] %in% value, id, NA)
 52 |   
 53 |   if (is.null(value2)) {
 54 |     value2 = ifelse(!is.na(tokens[[annotation]]) & is.na(value), id, NA)
 55 |   } else {
 56 |     value2 = ifelse(!is.na(tokens[[annotation]]) & tokens[[annotation]] %in% value2, id, NA)
 57 |   }
 58 |   
 59 |   value_label = paste(stats::na.omit(unique(as.character(tokens[[annotation]][value]))), collapse=', ')
 60 |   value2_label = paste(stats::na.omit(unique(as.character(tokens[[annotation]][value2]))), collapse=', ')
 61 |   
 62 |   tokens[[token_col]] = syntax_highlight_tokens(tokens$doc_id, tokens[[token_col]], tokens[[ann_id]], value, value2, value_label, value2_label)
 63 | 
 64 |   url = tokenbrowser::create_browser(tokens, meta, 'doc_id', token_col, filename= filename, ...)
 65 |   if (view) tokenbrowser::view_browser(url)
 66 |   invisible(url)
 67 | }
 68 | 
 69 | syntax_highlight_tokens <- function(doc_id, tokens, ann_id, value, value2, value_label, value2_label) {
 70 |   doc_i = match(doc_id, stats::na.omit(unique(doc_id)))
 71 |   ann_i = match(ann_id, stats::na.omit(unique(ann_id)))
 72 |   colindex = tapply(ann_i, doc_i, function(x) if (all(is.na(x))) rep(NA, length(x)) else (x - min(x, na.rm = TRUE)) + 1)
 73 |   colindex = as.numeric(unlist(colindex))
 74 |   
 75 |   ncolors = 8   ## repeat x colors over and over, so different colors are used for different annotation_ids, but we don't start a carnival
 76 |   colindex_mod = colindex %% ncolors + 1
 77 |   
 78 |   colors = grDevices::palette()
 79 |   #colors = grDevices::terrain.colors(ncolors+1)
 80 |   
 81 |   tcolor = colors[ifelse(is.na(value), NA, colindex_mod)]
 82 |   
 83 |   alpha = rep(0.2, length(value))
 84 |   alpha[is.na(tcolor)] = NA
 85 |   
 86 |   col = tokenbrowser::highlight_col(alpha, col=tcolor)
 87 |   tokens = tokenbrowser::tag_tokens(tokens,
 88 |                       title = ifelse(!is.na(value), sprintf('%s; %s', value_label, ann_id), NA),
 89 |                       style = tokenbrowser::attr_style(`background-color` = col, `border` = stringi::stri_paste('3px solid ', col)),
 90 |                       span_adjacent = TRUE, doc_id=doc_id)
 91 |   
 92 |   alpha = rep(0.8, length(value2))
 93 |   boxcolor = colors[ifelse(is.na(value2), NA, colindex_mod)]
 94 |   alpha[is.na(boxcolor)] = NA
 95 |   
 96 |   col = tokenbrowser::highlight_col(alpha, col=boxcolor)
 97 |   tokens = tokenbrowser::tag_tokens(tokens,
 98 |                       title = ifelse(!is.na(value2), sprintf('%s; %s', value2_label, ann_id), NA),
 99 |                       style = tokenbrowser::attr_style(`border` = stringi::stri_paste('3px solid ', col)),
100 |                       span_adjacent = TRUE, doc_id=doc_id)
101 |   
102 |   non_na_ann_i = match(ann_id, unique(ann_id))
103 |   non_na_ann_i[is.na(ann_id)] = NA
104 |   tokens = tokenbrowser::tag_tokens(tokens, 'a', tokenbrowser::tag_attr(name = stringi::stri_paste('nav', non_na_ann_i, sep='')),
105 |                       span_adjacent = TRUE, doc_id=doc_id)
106 |   
107 |   tokens
108 |   
109 | }
110 | 
111 | 
112 | 


--------------------------------------------------------------------------------
/tests/testthat/test_corenlp.r:
--------------------------------------------------------------------------------
  1 | context("Corenlp")
  2 | 
  3 | ENGLISH_SAY_VERBS = c("tell", "show", " acknowledge", "admit", "affirm", "allege", "announce", "assert", "attest", "avow", "claim", "comment", "concede", "confirm", "declare", "deny", "exclaim", "insist", "mention", "note", "proclaim", "promise", "remark", "report", "say", "speak", "state", "suggest", "talk", "tell", "write", "add")
  4 | 
  5 | corenlp_quote_queries <- function(verbs=ENGLISH_SAY_VERBS, exclude_verbs=NULL) {
  6 |   direct = tquery(lemma = verbs, NOT(lemma = exclude_verbs), label='verb',
  7 |                   children(relation=c('su', 'nsubj', 'agent', 'nmod:agent'), label='source'), 
  8 |                   children(label='quote'))
  9 |   
 10 |   nosrc = tquery(POS='VB*', 
 11 |                  children(relation= c('su', 'nsubj', 'agent', 'nmod:agent'), label='source'),
 12 |                  children(lemma = verbs, NOT(lemma = exclude_verbs), relation='xcomp', label='verb',
 13 |                           children(relation=c("ccomp", "dep", "parataxis", "dobj", "nsubjpass", "advcl"), label='quote')))
 14 |   
 15 |   according = tquery(label='quote',
 16 |                      children(relation='nmod:according_to', label='source',
 17 |                               children(label='verb')))
 18 |   
 19 |   
 20 |   list(direct=direct, nosrc=nosrc, according=according)
 21 | }
 22 | 
 23 | corenlp_clause_queries <- function(verbs=NULL, exclude_verbs=ENGLISH_SAY_VERBS, with_subject=TRUE, with_object=FALSE, sub_req=TRUE, ob_req=FALSE) {
 24 |   subject_name = if (with_subject) 'subject' else NA
 25 |   object_name = if (with_object) 'object' else NA
 26 |   
 27 |   #tokens = as_tokenindex(tokens_corenlp)
 28 |   
 29 |   direct = tquery(POS = 'VB*', lemma = verbs, NOT(lemma = exclude_verbs), label='predicate',
 30 |                   children(relation = c('su', 'nsubj', 'agent'), label=subject_name, req=sub_req),
 31 |                   children(relation = c('dobj'), label=object_name, req=ob_req)) 
 32 |   
 33 |   
 34 |   passive = tquery(POS = 'VB*', lemma = verbs, NOT(lemma = exclude_verbs), label='predicate',
 35 |                    children(relation = 'auxpass'),
 36 |                    children(relation = 'nmod:agent', label=subject_name, req=FALSE),
 37 |                    children(relation = 'nsubjpass', label=object_name, req=ob_req)) 
 38 |   
 39 |   copula_direct = tquery(POS = 'VB*', lemma = verbs, NOT(lemma = exclude_verbs),
 40 |                          parents(label='predicate',
 41 |                                  children(relation = c('su', 'nsubj', 'agent'), label=subject_name, req=sub_req),
 42 |                                  children(relation = c('dobj'), label=object_name, req=ob_req))) 
 43 |   
 44 |   copula_passive = tquery(POS = 'VB*', lemma = verbs, NOT(lemma = exclude_verbs),
 45 |                           parents(label='predicate',
 46 |                                   children(relation = c('su', 'nsubj', 'agent'), label=subject_name, req=sub_req),
 47 |                                   children(relation = c('dobj'), label=object_name, req=ob_req))) 
 48 |   
 49 |   
 50 |   list(direct=direct, passive=passive, copula_direct=copula_direct, copula_passive=copula_passive)
 51 | }
 52 | 
 53 | 
 54 | get_quotes <- function(tokens, block=NULL) {
 55 |   queries = corenlp_quote_queries()
 56 |   apply_queries(tokens, queries, as_chain=TRUE, block = block, check = FALSE)
 57 | }
 58 | 
 59 | get_clauses <- function(tokens, block=NULL){
 60 |   queries = corenlp_clause_queries(with_object = TRUE)
 61 |   apply_queries(tokens, queries, as_chain=TRUE, block = block, check = FALSE)
 62 | }
 63 | 
 64 | .check <- function(tokens, nodes, ...) {
 65 |   check = list(...)
 66 |   for(name in names(check)) {
 67 |     expected = as.character(check[[name]])
 68 |     actual = get_nodes(tokens, nodes, token_cols = 'token')
 69 |     #cat(name, ': ', as.character(actual$token[actual$.ROLE == name]), '\n')
 70 |     actual = as.character(actual$token[actual$.ROLE == name])
 71 |     expect_equal(expected, actual)
 72 |   }
 73 | }
 74 | 
 75 | 
 76 | test_that("extracting quotes works with coreNLP", {
 77 |   tokens = as_tokenindex(tokens_corenlp)
 78 |   library(testthat)
 79 |   #John says Mary is great.
 80 |   quotes = get_quotes(tokens[tokens$sentence == 1,])
 81 |   testthat::expect_equal(nrow(quotes), 6)
 82 |   
 83 |   .check(tokens, quotes, source="John", verb="says", quote=c("Mary",'is','great','.'))
 84 |   
 85 |   #  Pete promised to say he loves Mary
 86 |   quotes = get_quotes(tokens[tokens$sentence == 2,])
 87 |   testthat::expect_equal(nrow(quotes), 8)
 88 |   .check(tokens, quotes, source="Pete", verb="promised", quote=c('to','say','he','loves','Mary','.'))
 89 |   
 90 |   # According to Mark, he loves Mary.
 91 |   quotes = get_quotes(tokens[tokens$sentence == 3,])
 92 |   testthat::expect_equal(nrow(quotes), 8)
 93 |   .check(tokens, quotes, source="Mark", verb=c('According','to'), quote=c(',','he','loves','Mary','.'))
 94 |   
 95 |   # John Loves Mary
 96 |   quotes = get_quotes(tokens[tokens$sentence >= 4,])
 97 |   testthat::expect_equal(nrow(quotes), 0)
 98 | 
 99 |   all_quotes = get_quotes(tokens)
100 |   testthat::expect_equal(nrow(all_quotes), 22)
101 | })
102 | 
103 | test_that("extracting clauses works with coreNLP", {
104 |   tokens = as_tokenindex(tokens_corenlp)
105 |   
106 |   # John loves Mary
107 |   clauses = get_clauses(tokens[tokens$sentence == 4,])
108 |   testthat::expect_equal(nrow(clauses), 4)
109 |   .check(tokens, clauses, subject="John", predicate=c("loves",'.'), object='Mary')
110 |   
111 |   # Mary is loved by John
112 |   clauses = get_clauses(tokens[tokens$sentence == 5,])
113 |   annotate_tqueries(tokens[tokens$sentence == 5,], 'test', corenlp_clause_queries())
114 |   testthat::expect_equal(nrow(clauses), 6)
115 |   .check(tokens, clauses, subject=c("by","John"), predicate=c("is","loved","."), object='Mary')
116 |   
117 |   # Mary is loved (passive without subject)
118 |   clauses = get_clauses(tokens[tokens$sentence == 6,])
119 |   testthat::expect_equal(nrow(clauses), 4)
120 |   .check(tokens, clauses, predicate=c("is","loved","."), object='Mary')
121 |   
122 | })
123 | 


--------------------------------------------------------------------------------
/R/isolate_branches.r:
--------------------------------------------------------------------------------
  1 | #' Isolate a branch in a dependency tree
  2 | #'
  3 | #' cuts of a branch at the nodes that match the lookup arguents (...).
  4 | #' A "tree_parent" column is added to the tokenindex, that indicates for the new roots
  5 | #' which node the parent was.  
  6 | #'
  7 | #' @param tokens   A tokenindex
  8 | #' @param ...      lookup arguments to find the node to split. For example, isolate_branch(tokens, relation='relcl') 
  9 | #'                 isolates branches of which the top node (the new root) has the relation "relcl". 
 10 | #' @param copy_parent If TRUE (default) copy the parent of the branch and include it in the isolated branch
 11 | #' @param copy_parent_fill If TRUE, also copy the parents fill nodes
 12 | #'
 13 | #' @return the tokenindex
 14 | #' @export
 15 | #'
 16 | #' @examples
 17 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
 18 | #' tokens = as_tokenindex(tokens)
 19 | #' 
 20 | #' tokens2 = isolate_branch(tokens, relation = 'relcl', copy_parent = TRUE)
 21 | #' tokens2
 22 | #' \donttest{
 23 | #' if (interactive()) plot_tree(tokens2)
 24 | #' }
 25 | isolate_branch <- function(tokens, ..., copy_parent=TRUE, copy_parent_fill=TRUE) {
 26 |   if (rsyntax_threads() != data.table::getDTthreads()) {
 27 |     old_threads = data.table::getDTthreads()
 28 |     on.exit(data.table::setDTthreads(old_threads))
 29 |     data.table::setDTthreads(rsyntax_threads())
 30 |   }
 31 |   
 32 |   parent = .ISOLATED = NULL
 33 |   
 34 |   tokens = data.table::copy(tokens)
 35 |   if (!copy_parent) {
 36 |     ## in this case there can be no issues with nesting, so we can split everything in one go
 37 |     tq = tquery(label='parent',
 38 |                 children(..., label='branch'))
 39 |     tokens = select_nodes(tokens, tq)
 40 |     tokens = mutate_nodes(tokens, 'branch', parent = NA, relation = 'ROOT', tree_parent=parent$token_id)
 41 |   } else {
 42 |     ## if we do copy the parent, we need to do it recursively from root to bottom 
 43 |     tokens[, .ISOLATED := FALSE]
 44 |     tq = tquery(label='parent', .ISOLATED=FALSE, fill=copy_parent_fill,
 45 |                         children(..., label='branch'))
 46 |     
 47 |     
 48 |     tokens = rec_isolate(tokens, tq)
 49 |     tokens[, .ISOLATED := NULL]
 50 |   }
 51 |   tokens
 52 | }
 53 | 
 54 | rec_isolate <- function(tokens, tq) {
 55 |   parent_copy = parent = NULL
 56 |   
 57 |   
 58 |   tokens = select_nodes(tokens, tq, fill_only_first=TRUE, .one_per_sentence = TRUE)
 59 |   if (nrow(selected_nodes(tokens)$nodes) == 0) return(tokens)
 60 |   tokens = copy_nodes(tokens, 'parent', 'parent_copy', only_new = F, copy_fill=TRUE)
 61 |   tokens = mutate_nodes(tokens, 'branch', parent = parent_copy$token_id)
 62 |   tokens = mutate_nodes(tokens, 'parent_copy', parent = NA, relation = 'ROOT', tree_parent=parent$parent, .ISOLATED=TRUE)
 63 |   rec_isolate(tokens, tq)
 64 | }
 65 | 
 66 | #' Add the branch id as a column to the tokenindex
 67 | #'
 68 | #' After splitting trees into branches 
 69 | #'
 70 | #' @param tokens  A tokenindex
 71 | #'
 72 | #' @return the tokenindex
 73 | #' @export
 74 | #'
 75 | #' @examples
 76 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text4',]
 77 | #' tokens = as_tokenindex(tokens)
 78 | #' 
 79 | #' \donttest{
 80 | #' tokens2 = isolate_branch(tokens, relation = 'relcl', copy_parent = TRUE)
 81 | #' get_branch_id(tokens2)
 82 | #' }
 83 | get_branch_id <- function(tokens) {
 84 |   if (rsyntax_threads() != data.table::getDTthreads()) {
 85 |     old_threads = data.table::getDTthreads()
 86 |     on.exit(data.table::setDTthreads(old_threads))
 87 |     data.table::setDTthreads(rsyntax_threads())
 88 |   }
 89 |   
 90 |   branch_id = NULL
 91 |   
 92 |   tokens[, branch_id := ifelse(is.na(tokens$parent), tokens$token_id, NA)]
 93 |   tokens = fix_missing_parents(tokens)
 94 |   
 95 |   i = which(is.na(tokens$parent))
 96 |   safe_count = 1
 97 |   while(TRUE) {
 98 |     parents = tokens[i,c('doc_id','sentence','token_id','branch_id')]
 99 |     data.table::setnames(parents, 'token_id','parent')
100 |     parents = merge(parents, tokens[,c('doc_id','sentence','token_id','parent')], by=c('doc_id','sentence','parent'))
101 |     if (nrow(parents) == 0) break
102 |     i = tokens[parents, on=c('doc_id','sentence','token_id'), which=TRUE]
103 |     tokens[i, branch_id := parents$branch_id]
104 |     
105 |     if (safe_count == 200) {
106 |       warning("stopped recursive loop at iteration 200. This is supposedly the depth of the tree, but
107 |               since language is not THAT complex (unless you're working with German philosophers) it is
108 |               most likely that something else went wrong. Please check your data or submit a bug report if its my fault")
109 |     }
110 |     safe_count = safe_count + 1
111 |   }
112 |   tokens
113 | }
114 | 
115 | print_sentences <- function(tokens, sentence_i=1, token_col='token') {
116 |   doc_id = sentence = tree_parent_id = NULL
117 |   
118 |   sentences = unique(tokens[,c('doc_id','sentence')])
119 |   if (sentence_i > nrow(sentences)) stop('sentence_i is higher than number of sentences in tokens')
120 |   sents = get_branch_id(tokens[sentences[1,], on=c('doc_id','sentence')])
121 |   
122 |   bp = sents[!is.na(sents$tree_parent),c('doc_id','sentence','tree_parent','token_id')]
123 |   bp = merge(bp, sents[,c('doc_id','sentence','token_id','branch_id')], by.x=c('doc_id','sentence','tree_parent'), by.y=c('doc_id','sentence','token_id'), all.x=TRUE)
124 |   sents[bp, tree_parent_id := bp$branch_id, on=c('doc_id','sentence','token_id')]
125 |   
126 |   get_bp <- function(x) if (any(!is.na(x))) first(stats::na.omit(x)) else numeric()
127 |   sents = sents[,list(doc_id=unique(doc_id), sentence=unique(sentence), tree_parent=get_bp(tree_parent_id), text=paste(get(token_col), collapse=' ')), by='branch_id']
128 |   
129 |   for (i in which(is.na(sents$tree_parent))) {
130 |     rec_print_sentences(sents, i)
131 |     cat('\n')
132 |   }
133 |   tokens
134 | }
135 | 
136 | rec_print_sentences <- function(sents, ivec, level=1) {
137 |   if (length(ivec) == 0) return(NULL)
138 |   for (i in ivec) {
139 |     cat(rep('  ', level), gsub('\n', '', sents$text[i]), '\n')
140 |     rec_print_sentences(sents, which(floor(sents$tree_parent) == floor(sents$branch_id[i])), level=level+1)
141 |   }
142 | }
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/man/add_span_quotes.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/find_quotes.r
  3 | \name{add_span_quotes}
  4 | \alias{add_span_quotes}
  5 | \title{Add span quotes to a source-quote annotations}
  6 | \usage{
  7 | add_span_quotes(
  8 |   tokens,
  9 |   text_col,
 10 |   quote_col = "quotes",
 11 |   source_val = "source",
 12 |   quote_val = "quote",
 13 |   tqueries = NULL,
 14 |   par_col = NULL,
 15 |   space_col = NULL,
 16 |   lag_sentences = 1,
 17 |   add_quote_symbols = NULL,
 18 |   quote_subset = NULL,
 19 |   copy = TRUE
 20 | )
 21 | }
 22 | \arguments{
 23 | \item{tokens}{A tokenIndex with rsyntax annotations for 'sources' and 'quotes'}
 24 | 
 25 | \item{text_col}{The column with the text (often 'token' or 'word')}
 26 | 
 27 | \item{quote_col}{The column that contains the quote annotations}
 28 | 
 29 | \item{source_val}{The value in quote_col that indicates the source}
 30 | 
 31 | \item{quote_val}{The value in quote_col that indicates the quote}
 32 | 
 33 | \item{tqueries}{A list of tqueries, that will be performed to find source candidates. The order of the queries determines which source candidates are preferred. It would make sense to use the same value as in source_val in the 'label' argument for the tquery.}
 34 | 
 35 | \item{par_col}{If available in the parser output, the column with the paragraph id. We can assume that quotes do not span across paragraphs. By using this argument, quotes that are not properly closed (uneven number of quotes) will stop at the end of the paragraph}
 36 | 
 37 | \item{space_col}{If par_col is not used, paragraphs will be identified based on hard enters in the text_col. In some parsers, there is an additional "space" column that hold the whitespace and linebreaks, which can be included here.}
 38 | 
 39 | \item{lag_sentences}{The max number of sentences looked backwards to find source candidates. Default is 1, which means the source candidates have to occur in the sentence where the quote begins (lag = 0) or the sentence before that (lag = 1)}
 40 | 
 41 | \item{add_quote_symbols}{Optionally, add additional punctuation symbols for finding quotation marks. 
 42 | In some contexts and languages it makes sense to add single quotes, but in that case it is oftne necessary to 
 43 | also use the quote_subset argument. For instance, in Spacy (and probably other UD based annotations), single quotes in posessives (e.g., Bob's, scholars') have a
 44 | PART POS tag, whereas quotation symbols have PUNCT, NOUN, VERB, or ADJ (for some reason).}
 45 | 
 46 | \item{quote_subset}{Optionally, an expression to be evaluated on the columns of 'tokens' for selecting/deselecting tokens that can/cant be quotation marks. For example,
 47 | pos != "PART" can be used for the example mentioned in add_quote_symbols.}
 48 | 
 49 | \item{copy}{If TRUE, deep copy the data.table (use if output tokens do not overwrite input tokens)}
 50 | }
 51 | \value{
 52 | the tokenIndex
 53 | }
 54 | \description{
 55 | Quotes can span across sentences, which makes it impossible to find them based on dependency tree quories. 
 56 | This function can be used as post-processing, AFTER using tqueries to find 'source' and 'quote' nodes, to add some of these quotes. 
 57 | 
 58 | The quotes themselves are often easy to detect due to the use of quotation marks. There are two common ways of indicating the sources.
 59 | 
 60 | Firstly, the source might be used before the start of the quote (Steve said: "hey a quote!". "I like quotes!").
 61 | Secondly, the source might be implied in the sentence where the quotes starts, or the sentence before that (Steve was mad. "What a stupid way of quoting me!"). 
 62 | 
 63 | In the first case, the source can be found with a tquery. If there is a source (source_val) in the quote_col that is linked to a part of the quote (quote_val), this function will add the rest of the quote. 
 64 | 
 65 | In the second case, we can look for candidates near the beginning of the quote. The candidate criteria can be specified as tqueries
 66 | }
 67 | \examples{
 68 | ## This function is best used after first annotating regular quotes
 69 | ## Here we first apply 3 tqueries for annotating quotes in spacy tokens
 70 | 
 71 | \donttest{
 72 | 
 73 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text6',]
 74 | 
 75 | verbs = c("tell", "show", "acknowledge", "admit", "affirm", "allege", 
 76 |   "announce", "assert", "attest", "avow", "call", "claim", "comment", 
 77 |   "concede", "confirm", "declare", "deny", "exclaim", "insist", "mention", 
 78 |   "note", "post","predict", "proclaim", "promise", "reply", "remark", 
 79 |   "report", "say", "speak", "state", "suggest", "talk", "tell", "think",
 80 |   "warn","write", "add")
 81 | 
 82 | direct = tquery(lemma = verbs, label='verb',
 83 |    children(req=FALSE, relation = c('npadvmod'), block=TRUE),
 84 |    children(relation=c('su','nsubj','agent','nmod:agent'), label='source'),
 85 |    children(label='quote'))
 86 | 
 87 | nosrc = tquery(pos='VERB*',
 88 |    children(relation= c('su', 'nsubj', 'agent', 'nmod:agent'), label='source'),
 89 |    children(lemma = verbs, relation='xcomp', label='verb',
 90 |      children(relation=c("ccomp","dep","parataxis","dobj","nsubjpass","advcl"), label='quote')))
 91 | 
 92 | according = tquery(label='quote',
 93 |    children(relation='nmod:according_to', label='source',
 94 |         children(label='verb')))
 95 | 
 96 | tokens = annotate_tqueries(tokens, 'quote', dir=direct, nos=nosrc, acc=according)
 97 | tokens
 98 | 
 99 | ## now we add the span quotes. If a span quote is found, the algorithm will first
100 | ## look for already annotated sources as source candidates. If there are none,
101 | ## additional tqueries can be used to find candidates. Here we simply look for
102 | ## the most recent PERSON entity
103 | 
104 | tokens = tokens_spacy[tokens_spacy$doc_id == 'text6',]
105 | tokens = annotate_tqueries(tokens, 'quote', dir=direct, nos=nosrc, acc=according)
106 | 
107 | 
108 | last_person = tquery(entity = 'PERSON*', label='source')
109 | tokens = add_span_quotes(tokens, 'token', 
110 |                          quote_col = 'quote', source_val = 'source', quote_val = 'quote', 
111 |                          tqueries=last_person)
112 | tokens
113 | 
114 | ## view as full text
115 | syntax_reader(tokens, annotation = 'quote', value = 'source')
116 | }
117 | }
118 | 


--------------------------------------------------------------------------------
/R/token_index.r:
--------------------------------------------------------------------------------
  1 | #' Prepare a tokenIndex
  2 | #' 
  3 | #' @description
  4 | #' Creates a tokenIndex data.table. 
  5 | #' Accepts any data.frame given that the required columns (doc_id, sentence, token_id, parent, relation) are present.
  6 | #' The names of these columns must be one of the values specified in the respective arguments.
  7 | #'
  8 | #' The data in the data.frame will not be changed, with three exceptions. First, the columnnames will be changed if the default values are not used.
  9 | #' Second, if a token has itself as its parent (which in some parsers is used to indicate the root), the parent is set to NA (as used in other parsers) to prevent infinite cycles.
 10 | #' Third, the data will be sorted by doc_id, sentence, token_id.
 11 | #'
 12 | #' @param tokens     A data.frame, data.table, or tokenindex. 
 13 | #' @param doc_id     candidate names for the document id columns
 14 | #' @param sentence   candidate names for sentence (id/index) column
 15 | #' @param token_id   candidate names for the  token id column. Has to be numeric (Some parsers return token_id's as numbers with a prefix (t_1, w_1))
 16 | #' @param parent     candidate names for the parent id column. Has to be numeric
 17 | #' @param relation   candidate names for the relation column
 18 | #' @param paragraph  Optionally, the name of a column with paragraph ids. This is only necessary if sentences are numbered per paragraph, and therefore not unique within documents. If given, sentences are re-indexed to be unique within documents.
 19 | #'
 20 | #' @return   a tokenIndex
 21 | #' @export
 22 | #' @examples
 23 | #' as_tokenindex(tokens_corenlp)
 24 | as_tokenindex <- function(tokens, doc_id=c('doc_id','document_id'), sentence=c('sentence', 'sentence_id'), token_id=c('token_id'), parent=c('parent','head_token_id'), relation=c('relation','dep_rel'), paragraph=NULL) {
 25 |   if (rsyntax_threads() != data.table::getDTthreads()) {
 26 |     old_threads = data.table::getDTthreads()
 27 |     on.exit(data.table::setDTthreads(old_threads))
 28 |     data.table::setDTthreads(rsyntax_threads())
 29 |   }
 30 |   
 31 |   new_index = !methods::is(tokens, 'tokenIndex')
 32 |     
 33 |   ## if we can confirm that this is udpipe input, do not give a warning for missing parents
 34 |   is_udpipe = all(c('doc_id','token_id','head_token_id','dep_rel') %in% colnames(tokens))
 35 |   warn = !is_udpipe
 36 | 
 37 |   for (cols_obj in c('doc_id','sentence','token_id','parent','relation')) {
 38 |     cols = get(cols_obj)
 39 |     in_tokens = cols %in% colnames(tokens)
 40 |     if (!any(in_tokens)) stop(sprintf('None of the default values in c(%s) is a valid column in tokens', paste(cols,collapse=', ')))
 41 |     col = cols[which(in_tokens)[1]]
 42 |     assign(cols_obj, value = col)
 43 |   }
 44 |   
 45 |   if (!methods::is(tokens, 'data.table')) {
 46 |     tokens = data.table::data.table(tokens)
 47 |     data.table::setnames(tokens, old = c(doc_id, sentence, token_id, parent, relation), new=c('doc_id','sentence','token_id','parent', 'relation'))
 48 |   } else {
 49 |     ## if already a data.table, do not change by reference
 50 |     if (!all(c('doc_id','sentence','token_id','parent','relation') %in% colnames(tokens))) {
 51 |       colnames(tokens)[match(c(doc_id,sentence,token_id,parent,relation), colnames(tokens))] = c('doc_id','sentence','token_id','parent','relation')
 52 |     }
 53 |   }
 54 |   
 55 |   ## token_id and parent need to be identical (not integer vs numeric)
 56 |   tokens$token_id = as.numeric(tokens$token_id)   
 57 |   tokens$parent = as.numeric(tokens$parent)
 58 |   
 59 |   ## in some cases (such as udpipe) sentence_id has the sentence index, and sentence is a text column
 60 |   if ('sentence_id' %in% colnames(tokens)) {
 61 |     tokens$sentence_txt = tokens$sentence
 62 |     tokens$sentence = tokens$sentence_id
 63 |     tokens$sentence_id = NULL
 64 |   }
 65 |   if (!methods::is(tokens$sentence, 'numeric')) {
 66 |     if (methods::is(tokens$sentence, 'factor')) 
 67 |       tokens$sentence = as.numeric(tokens$sentence)
 68 |     else {
 69 |       ## create a counter that increments for every new sentence within a document 
 70 |       tokens$sentence = tokens$sentence != data.table::shift(tokens$sentence, 1, fill=NA) 
 71 |       tokens$sentence[1] = TRUE
 72 |       tokens[, sentence := cumsum(sentence), by='doc_id']
 73 |     }
 74 |   }
 75 |   
 76 |   if (!is.null(paragraph)) {
 77 |     browser()
 78 |     
 79 |     .sentence = NULL; sentence = NULL
 80 |     data.table::setorderv(tokens, c('doc_id',paragraph, 'sentence','token_id'))
 81 |     sents = unique(tokens[,c('doc_id',paragraph,'sentence'), with=F], by = c('doc_id',paragraph, 'sentence'))
 82 |     sents[, .sentence := 1:length(sentence), by=c('doc_id')]
 83 |     tokens[sents, .sentence := .sentence, on=c('doc_id',paragraph,'sentence')]  
 84 |   }
 85 |   if (anyDuplicated(tokens[,c('doc_id','sentence','token_id')])) stop('tokenIndex has duplicate doc_id - sentence - token_id tripples. This can for instance happen if sentences are numbered within paragraphs (sentence 1 in par 1, sentence 1 in par 2, etc). If this is the cause, you might solve it with the "paragraph" argument.')
 86 | 
 87 |   if (new_index) {
 88 |     is_own_parent = tokens$parent == tokens$token_id
 89 |     is_own_parent[is.na(is_own_parent)] = FALSE
 90 |     if (any(is_own_parent)) tokens$parent[is_own_parent] = NA
 91 |     levels(tokens$relation) = union(levels(tokens$relation), 'ROOT')
 92 |     tokens$relation[is.na(tokens$parent)] = 'ROOT'
 93 |   }
 94 | 
 95 |   has_keys = data.table::key(tokens)
 96 |   if (!identical(has_keys, c('doc_id','sentence','token_id'))) data.table::setkeyv(tokens, c('doc_id','sentence','token_id'))
 97 |   has_indices = data.table::indices(tokens)
 98 |   if (!'doc_id__sentence__parent' %in% has_indices) data.table::setindexv(tokens, c('doc_id','sentence','parent'))
 99 |   if (!'relation' %in% has_indices) data.table::setindexv(tokens, 'relation')
100 |   
101 |   if (new_index) {
102 |     tokens = fix_missing_parents(tokens, warn)
103 |     data.table::setattr(tokens, name = 'class', c('tokenIndex', class(tokens)))
104 |   }
105 |   
106 |   
107 |   tokens[]
108 | }
109 | 
110 | fix_missing_parents <- function(tokens, warn=TRUE) {
111 |   parent = NULL; relation = NULL
112 |   parent_ids = stats::na.omit(unique(tokens[,c('doc_id','sentence','parent')]))
113 |   data.table::setnames(parent_ids, old='parent', new='token_id')
114 |   missing_parents = parent_ids[!tokens, on=c('doc_id','sentence','token_id')]
115 |   if (warn && nrow(missing_parents) > 0) warning(sprintf('There are %s tokens with missing parents. These have now been made roots (parent = NA, relation="ROOT")', nrow(missing_parents)))
116 |   
117 |   if (nrow(missing_parents) > 0) {
118 |     data.table::setnames(missing_parents, old='token_id', new='parent')
119 |     i = tokens[missing_parents, on=c('doc_id','sentence','parent'), which=TRUE]
120 |     tokens[i, parent := NA]
121 |     tokens[i, relation := "ROOT"]
122 |   }
123 |   tokens
124 | }
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/man/nested_nodes.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/tquery.r
  3 | \name{nested_nodes}
  4 | \alias{nested_nodes}
  5 | \alias{children}
  6 | \alias{not_children}
  7 | \alias{parents}
  8 | \alias{not_parents}
  9 | \title{Search for parents or children in tquery}
 10 | \usage{
 11 | children(
 12 |   ...,
 13 |   g_id = NULL,
 14 |   label = NA,
 15 |   req = TRUE,
 16 |   depth = 1,
 17 |   connected = FALSE,
 18 |   fill = TRUE,
 19 |   block = FALSE,
 20 |   max_window = c(Inf, Inf),
 21 |   min_window = c(0, 0)
 22 | )
 23 | 
 24 | not_children(
 25 |   ...,
 26 |   g_id = NULL,
 27 |   depth = 1,
 28 |   connected = FALSE,
 29 |   max_window = c(Inf, Inf),
 30 |   min_window = c(0, 0)
 31 | )
 32 | 
 33 | parents(
 34 |   ...,
 35 |   g_id = NULL,
 36 |   label = NA,
 37 |   req = TRUE,
 38 |   depth = 1,
 39 |   connected = FALSE,
 40 |   fill = TRUE,
 41 |   block = FALSE,
 42 |   max_window = c(Inf, Inf),
 43 |   min_window = c(0, 0)
 44 | )
 45 | 
 46 | not_parents(
 47 |   ...,
 48 |   g_id = NULL,
 49 |   depth = 1,
 50 |   connected = FALSE,
 51 |   max_window = c(Inf, Inf),
 52 |   min_window = c(0, 0)
 53 | )
 54 | }
 55 | \arguments{
 56 | \item{...}{Accepts two types of arguments: name-value pairs for finding nodes (i.e. rows), and functions to look for parents/children of these nodes.
 57 | 
 58 | The name in the name-value pairs need to match a column in the data.table, and the value needs to be a vector of the same data type as the column.
 59 | By default, search uses case sensitive matching, with the option of using common wildcards (* for any number of characters, and ? for a single character).
 60 | Alternatively, flags can be used to to change this behavior to 'fixed' (__F), 'igoring case' (__I) or 'regex' (__R). See details for more information. 
 61 | 
 62 | If multiple name-value pairs are given, they are considered as AND statements, but see details for syntax on using OR statements, and combinations.
 63 | 
 64 | To look for parents and children of the nodes that are found, you can use the \link{parents} and \link{children} functions as (named or unnamed) arguments. 
 65 | These functions have the same query arguments as tquery, but with some additional arguments.}
 66 | 
 67 | \item{g_id}{Find nodes by global id, which is the combination of the doc_id, sentence and token_id. Passed as a data.frame or data.table with 3 columns: (1) doc_id, (2) sentence and (3) token_id.}
 68 | 
 69 | \item{label}{A character vector, specifying the column name under which the selected tokens are returned. 
 70 | If NA, the column is not returned.}
 71 | 
 72 | \item{req}{Can be set to false to not make a node 'required'. This can be used to include optional nodes in queries. For instance, in a query for finding subject - verb - object triples, 
 73 | make the object optional.}
 74 | 
 75 | \item{depth}{A positive integer, determining how deep parents/children are sought. 1 
 76 | means that only direct parents and children of the node are retrieved. 2 means children and grandchildren, etc.
 77 | All parents/children must meet the filtering conditions (... or g_id)}
 78 | 
 79 | \item{connected}{Controls behavior if depth > 1 and filters are used. If FALSE, all parents/children to the given depth are retrieved, and then filtered. 
 80 | This way, grandchildren that satisfy the filter conditions are retrieved even if their parents do not satisfy the conditions.
 81 | If TRUE, the filter is applied at each level of depth, so that only fully connected branches of nodes that satisfy the conditions are retrieved.}
 82 | 
 83 | \item{fill}{Logical. If TRUE (default), the default custom_fill() will be used. To more specifically control fill, you can nest the \link{custom_fill} 
 84 | function (a special version of the children function).}
 85 | 
 86 | \item{block}{Logical. If TRUE, the node will be blocked from being assigned (labeld). This is mainly usefull if you have a node that you do not want to be assigned by fill,
 87 | but also don't want to 'label' it. Essentially, block is shorthand for using label and then removing the node afterwards. If block is TRUE, label has to be NA.}
 88 | 
 89 | \item{max_window}{Set the max token distance of the children/parents to the node. Has to be either a numerical vector of length 1 for distance in both directions, or a 
 90 | vector of length 2, where the first value is the max distance to the left, and the second value the max distance to the right. Default is c(Inf, Inf) meaning that no max distance is used.}
 91 | 
 92 | \item{min_window}{Like max_window, but for the min distance. Default is c(0,0) meaning that no min is used.}
 93 | }
 94 | \value{
 95 | Should not be used outside of \link{tquery}
 96 | }
 97 | \description{
 98 | Enables searching for parents or children.
 99 | Should only be used inside of the \link{tquery} function, or within other children/parents functions.
100 | Look-up conditions are specified in the same way as in the tquery function.
101 |  
102 | Multiple children() or parents() functions can be nested side by side.
103 | This works as an AND condition: the node must have all these parents/children (unless the req [required] argument is set to FALSE).
104 | 
105 | 
106 | The custom_fill() function is used to include the children of a 'labeled' node. It can only be nested in a query if the label argument is not NULL,
107 | and by default will include all children of the node that have not been assigned to another node. If two nodes have a shared child, the child will be
108 | assigned to the closest node.
109 | }
110 | \details{
111 | Having nested queries can be confusing, so we tried to develop the find_nodes function and the accompanying functions in a way
112 | that clearly shows the different levels. As shown in the examples, the idea is that each line is a node, and to look for parents
113 | or children, we put them on the next line with indentation (in RStudio, it should automatically align correctly when you press enter inside
114 | of the children() or parents() functions). 
115 | 
116 | There are several flags that can be used to change search condition. To specify flags, add a double underscore and the flag character to the name in the name value pairs (...).
117 | By adding the suffix __R, query terms are considered to be regular expressions, and the suffix __I uses case insensitive search (for normal or regex search).
118 | If the suffix __F is used, only exact matches are valid (case sensitive, and no wildcards).
119 | Multiple flags can be combined, such as lemma__RI, or lemma__IR  (order of flags is irrelevant)
120 | 
121 | The not_children and not_parents functions will make the matched children/parents a NOT condition. Note that this is different from using the NOT() look-up function.
122 | NOT operates at the node level, so you specify that a node should NOT be matched if certain conditions are met. the not_parents and not_children functions operate 
123 | at the pattern level, so you can specify that a pattern is invalid if these parents/children are matched.
124 | 
125 | Next to the OR, AND, and NOT functions, children/parents functions can have the special BREAK function for cases where depth > 1.
126 | If depth > 1 in the children, parents or fill function, the children/parents will
127 | be retrieved recursively (i.e. children, children of children, etc.).
128 | If the look-up conditions (e.g., relation = 'nsubj') are not satisfied, a node 
129 | will not be matched by the query, but the search will still continue for it's
130 | parents/children. The special BREAK look-up function allows you to specify a condition
131 | for breaking the recursive loop (lending it's name from the `break` in a for loop).
132 | An example is that you might want to stop the recursive loop in a custom_fill() once it encounters
133 | a nested sentence, such as a relative clause: custom_fill(BREAK(relation = 'relcl')).
134 | }
135 | 


--------------------------------------------------------------------------------
/R/find_nodes.r:
--------------------------------------------------------------------------------
  1 | find_nodes <- function(tokens, tquery, block=NULL, use_index=TRUE, name=NA, fill=TRUE, melt=TRUE, root_dist=FALSE) {
  2 |   .MATCH_ID = NULL; .DROP = NULL; .ID = NULL ## declare data.table bindings
  3 |   tokens = as_tokenindex(tokens)  
  4 |   block = get_long_ids(block)
  5 |   
  6 |   nodes = filter_tokens(tokens, lookup=tquery$lookup, .G_ID=tquery$g_id, .BLOCK=block, use_index=use_index)
  7 |   if (nrow(nodes) == 0) return(NULL)
  8 |   nodes = subset(nodes, select = c('doc_id','sentence','token_id'))
  9 | 
 10 |   any_nonfill_nested = any(sapply(tquery$nested, function(x) !methods::is(x, 'tQueryFill')))
 11 |   if (any_nonfill_nested) {
 12 |     nodes = find_nested(tokens, nodes, tquery, block, fill=FALSE, block_loop=F)
 13 |   } else {
 14 |     data.table::setnames(nodes, old = 'token_id', new='.ID')
 15 |     if (!is.na(tquery$label)) nodes[,(tquery$label) := .ID]
 16 |   } 
 17 | 
 18 |   if (is.null(nodes)) return(NULL)  
 19 |   if (nrow(nodes) == 0) return(NULL)
 20 | 
 21 |   ### possible solution for removing block within rec_search
 22 |   nodes = get_root_dist(tokens, nodes)
 23 |   nodes = get_unique_patterns(nodes)
 24 |   if (!root_dist) nodes$.ROOT_DIST = NULL
 25 |   
 26 |   if (fill) nodes = add_fill(tokens, nodes, tquery, block=nodes)
 27 | 
 28 |   
 29 |   nodes = create_unique_key(nodes, name, tquery)
 30 |   if (melt) {
 31 |     nodes = melt_nodes_list(nodes)
 32 |   }
 33 |   nodes[]
 34 | }
 35 | 
 36 | find_nested <- function(tokens, nodes, tquery, block, fill, block_loop) {
 37 |   .ID = NULL; .MATCH_ID = NULL
 38 | 
 39 |   nodes = rec_find(tokens, ids=nodes, ql=tquery$nested, block=block, fill=fill, block_loop=block_loop)
 40 |   
 41 |   if (nrow(nodes) == 0) return(NULL)
 42 |   nodes[, .ID := .MATCH_ID]
 43 |   data.table::setcolorder(nodes, c('.ID', setdiff(colnames(nodes), '.ID')))
 44 |   
 45 |   if (is.na(tquery$label)) {
 46 |     nodes[,.MATCH_ID := NULL]
 47 |   } else {
 48 |     data.table::setnames(nodes, '.MATCH_ID', tquery$label)
 49 |   }
 50 |   
 51 |   dropcols = grep('.DROP.*', colnames(nodes), value=TRUE)
 52 |   if (length(dropcols) > 0) nodes[, (dropcols) := NULL]
 53 |   
 54 |   unique(nodes)
 55 | }
 56 | 
 57 | add_fill <- function(tokens, nodes, tquery, block, level=1) {
 58 |   is_fill = sapply(tquery$nested, methods::is, 'tQueryFill')
 59 | 
 60 |   if (any(!is_fill)) {
 61 |     for (tq in tquery$nested[!is_fill]) {
 62 |       nodes = add_fill(tokens, nodes, tq, block, level+1)
 63 |     }
 64 |   } 
 65 |   
 66 |   if (any(is_fill)) {
 67 |     if (is.na(tquery$label)) {
 68 |       if (level == 1) match_id = '.ID' else return(nodes)
 69 |     } else match_id = tquery$label
 70 |     if (!match_id %in% colnames(nodes)) return(nodes)
 71 |     ids = subset(nodes, select = c('doc_id','sentence',match_id))
 72 |     ids = unique(stats::na.omit(ids))
 73 |     add = rec_find(tokens, ids, tquery$nested[is_fill], block = block, fill=TRUE, block_loop=T)
 74 |     
 75 |     if (grepl('#', tquery$label)) {
 76 |       label = gsub('#.*', '', tquery$label)
 77 |       label = paste0('^', label, '\\_')
 78 |       colnames(add) = gsub(label, tquery$label, colnames(add))
 79 |     }
 80 | 
 81 |     if (nrow(add) > 0) {
 82 |       setkeyv(nodes, c('doc_id','sentence',match_id))
 83 |       nodes = merge(nodes, add, by.x=c('doc_id','sentence',match_id), by.y=c('doc_id','sentence','.MATCH_ID'), all.x=TRUE, allow.cartesian=TRUE)
 84 |       dropcols = grep('.DROP.*', colnames(nodes), value=TRUE)
 85 |       if (length(dropcols) > 0) nodes[, (dropcols) := NULL]
 86 |     }
 87 |   }
 88 |   unique(nodes)
 89 | }
 90 | 
 91 | get_top_label <- function(tquery) {
 92 |   ## get the first label in a tquery.
 93 |   if (!is.na(tquery$label)) return(tquery$label)
 94 |   if (!is.null(tquery$nested)) {
 95 |     for (nested in tquery$nested) {
 96 |       label = get_top_label(nested)
 97 |       if (!is.na(label)) return(label)
 98 |     }
 99 |   }
100 |   return(NA)
101 | }
102 | 
103 | create_unique_key <- function(nodes, name, tquery){
104 |   id_col = get_top_label(tquery)
105 |   if (!is.na(name)) {
106 |     key = paste0(name, '#', nodes$doc_id, '.', nodes$sentence, '.', nodes[[id_col]])
107 |   } else {
108 |     key = paste0(nodes$doc_id, '.', nodes$sentence, '.', nodes[[id_col]])
109 |   }
110 | 
111 |   nodes$.ID = paste0(nodes$doc_id, '...', nodes$sentence, '...', nodes$.ID) ## quick fix for matching on 3 columns
112 |   key = key[match(nodes$.ID, nodes$.ID)] ## give same id to nodes with same .ID
113 |   #key = paste0(name, '#', 1:nrow(nodes))
114 |   nodes$.ID = key
115 |   return(nodes)
116 | }
117 | 
118 | get_root_dist <- function(tokens, nodes) {
119 |   .ROOT_DIST = NULL
120 |   tf = token_family(tokens, unique(data.table(doc_id=nodes$doc_id, sentence=nodes$sentence, token_id=nodes$.ID)), 
121 |                     depth=Inf, level='parents', minimal=TRUE, show_level=TRUE, replace=TRUE)
122 |   tf = data.table::setorderv(tf, cols = '.FILL_LEVEL', order = -1)
123 |   tf = unique(tf, by=c('doc_id','sentence','.MATCH_ID'))
124 |   data.table::setnames(tf, c('.FILL_LEVEL', '.MATCH_ID'), c('.ROOT_DIST', '.ID'))
125 |   tf = subset(tf, select=c('doc_id','sentence','.ID','.ROOT_DIST'))
126 |   nodes = merge(nodes, tf, by = c('doc_id','sentence','.ID'), all.x=TRUE)
127 |   #nodes = nodes[list(tf$doc_id, tf$sentence, tf$.MATCH_ID), .ROOT_DIST := tf$.FILL_LEVEL, on=c('doc_id','sentence','.ID')]
128 |   nodes[is.na(nodes$.ROOT_DIST), .ROOT_DIST := 0]
129 |   nodes
130 | }
131 | 
132 | 
133 | 
134 | get_unique_patterns <- function(nodes) {
135 |   ln = nodes
136 |   ln$i = 1:nrow(ln)
137 |   ln = data.table::melt(ln, id.vars=c('doc_id','sentence','.ID','i','.ROOT_DIST'))
138 |   ln = ln[!is.na(ln$value),]
139 |   data.table::setorderv(ln, c('doc_id','sentence','.ID','i'))
140 | 
141 |   ## rm duplicate i-value pairs
142 |   rm_i = unique(ln$i[duplicated(ln[,c('i','value')])])
143 |   if (length(rm_i > 0)) ln = ln[-ln[list(i=rm_i), on='i', which=T]]
144 |   
145 |   ## If nodes are matched multiple times, remove the ones where the root dist is higher
146 |   ## (these are most often nested in the other pattern, unless very compliated tqueries are used)
147 |   possible_dupl = unique(ln$value[duplicated(ln[,c('doc_id','sentence','value')])])
148 |   possible_dupl = unique(ln[list(value=possible_dupl),,on='value']$i)
149 |   dup = get_duplicates(ln[list(i=possible_dupl),,on='i'])
150 |   
151 |   ## A complication is that once we remove a duplicate/nested pattern, 
152 |   ## It might also solve another duplicate. So we can't just remove
153 |   ## all duplicates. We can see which duplicates certainly need to be removed
154 |   ## by looking which duplicates are not 'solved' by removing other duplicates
155 |   ## We then repeat this until no remain
156 |   ## this loop is guaranteed to remove one pattern per sentence per iteration (so it's fairly short) 
157 |   rm_j = rep(F, nrow(nodes))
158 |   while (TRUE) {
159 |     dupl_pat = unique(dup$i.x)    ## what are patterns (by index i in nodes) that seem to be duplicates?
160 |     possible_nondupl = dup[list(i.y = dupl_pat),,on='i.y',which=T,nomatch=0] ## get all duplicates where the matched pattern is a duplicate in another pattern 
161 |     if (length(possible_nondupl) > 0) {
162 |       certain_dupl = dup[-possible_nondupl]    ## if the matched pattern is not a duplicate in another pattern, we can be sure it's a definitive duplicate
163 |       rm_j[certain_dupl$i.x] = T
164 |     } else {
165 |       rm_j[dup$i.x] = T
166 |       break
167 |     }
168 |     possible_dupl = setdiff(possible_nondupl, unique(certain_dupl$i.x))  ## now repeat for remaining possible duplicates
169 |     dup = get_duplicates(ln[list(i=possible_dupl),,on='i'])
170 |   }
171 |   rm_j = which(rm_j)
172 | 
173 |   nodes
174 |   if (length(rm_i) > 0 || length(rm_j) > 0)
175 |     nodes = nodes[-unique(c(rm_i, rm_j)),]
176 |   
177 |   nodes
178 |   
179 | }  
180 | 
181 | get_duplicates <- function(ln, priority='higher') {
182 |   ln_m = merge(ln[,!colnames(ln) == 'variable', with=F], 
183 |                ln[,c('doc_id','sentence','.ID','.ROOT_DIST','value','i')], 
184 |                by=c('doc_id','sentence','value'), allow.cartesian = T)
185 |   ln_m = ln_m[ln_m$.ID.x != ln_m$.ID.y,]
186 |   
187 |   if (priority == 'higher') {
188 |     dupl = ifelse(ln_m$.ROOT_DIST.x != ln_m$.ROOT_DIST.y, 
189 |                   ln_m$.ROOT_DIST.x > ln_m$.ROOT_DIST.y,       ## remove x if x lower in tree
190 |                   ln_m$.ID.x > ln_m$.ID.y)                     ## and if same height remove x if to the left in sentence
191 |   } else {
192 |     dupl = ifelse(ln_m$.ROOT_DIST.x != ln_m$.ROOT_DIST.y, 
193 |                   ln_m$.ROOT_DIST.x < ln_m$.ROOT_DIST.y,       ## remove x if x higher in tree
194 |                   ln_m$.ID.x > ln_m$.ID.y)                     ## and if same height remove x if to the left in sentence
195 |   }
196 |   ui = unique(ln_m$i.x[dupl])
197 |   ln_m[list(i.x=ui),,on='i.x']
198 | }


--------------------------------------------------------------------------------
/R/annotate.r:
--------------------------------------------------------------------------------
  1 | #' Annotate a tokenlist based on rsyntax queries
  2 | #'
  3 | #' @description 
  4 | #' Apply queries to extract syntax patterns, and add the results as three columns to a tokenlist.
  5 | #' The first column contains the ids for each hit. The second column contains the annotation label. The third column contains the fill level (which you probably won't use, but is important for some functionalities).
  6 | #' Only nodes that are given a name in the tquery (using the 'label' parameter) will be added as annotation.
  7 | #' 
  8 | #' Note that while queries only find 1 node for each labeld component of a pattern (e.g., quote queries have 1 node for "source" and 1 node for "quote"), 
  9 | #' all children of these nodes can be annotated by settting fill to TRUE. If a child has multiple ancestors, only the most direct ancestors are used (see documentation for the fill argument).
 10 | #' 
 11 | #' @param tokens      A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.
 12 | #' @param column      The name of the column in which the annotations are added. The unique ids are added as column_id
 13 | #' @param ...         One or multiple tqueries, or a list of queries, as created with \link{tquery}. Queries can be given a named by using a named argument, which will be used in the annotation_id to keep track of which query was used. 
 14 | #' @param block       Optionally, specify ids (doc_id - sentence - token_id triples) that are blocked from querying and filling (ignoring the id and recursive searches through the id). 
 15 | #' @param fill        Logical. If TRUE (default) also assign the fill nodes (as specified in the tquery). Otherwise these are ignored 
 16 | #' @param overwrite   Applies if column already exists. If TRUE, existing column will be overwritten. If FALSE, the existing annotations in the column will be blocked, and new annotations will be added. This is identical to using multiple queries.
 17 | #' @param block_fill  If TRUE (and overwrite is FALSE), the existing fill nodes will also be blocked. In other words, the new annotations will only be added if the 
 18 | #' @param copy        If TRUE (default), the data.table is copied. Otherwise, it is changed by reference. Changing by reference is faster and more memory efficient, but is not predictable R style, so is optional. 
 19 | #' @param verbose     If TRUE, report progress (only usefull if multiple queries are given)
 20 | #' 
 21 | #' @export
 22 | #' @return The tokenIndex data.table with the annotation columns added
 23 | #' @examples
 24 | #' ## spacy tokens for: Mary loves John, and Mary was loved by John
 25 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
 26 | #' 
 27 | #' ## two simple example tqueries
 28 | #' passive = tquery(pos = "VERB*", label = "predicate",
 29 | #'                  children(relation = c("agent"), label = "subject"))
 30 | #' active =  tquery(pos = "VERB*", label = "predicate",
 31 | #'                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
 32 | #' 
 33 | #' tokens = annotate_tqueries(tokens, "clause", pas=passive, act=active)
 34 | #' tokens
 35 | #' \donttest{ 
 36 | #' if (interactive()) plot_tree(tokens, annotation='clause')
 37 | #' }
 38 | annotate_tqueries <- function(tokens, column, ..., block=NULL, fill=TRUE, overwrite=NA, block_fill=FALSE, copy=TRUE, verbose=FALSE) {
 39 |   if (rsyntax_threads() != data.table::getDTthreads()) {
 40 |     old_threads = data.table::getDTthreads()
 41 |     on.exit(data.table::setDTthreads(old_threads))
 42 |     data.table::setDTthreads(rsyntax_threads())
 43 |   }
 44 |   
 45 |   queries = list(...)
 46 |   is_tquery = sapply(queries, methods::is, 'tQuery')
 47 |   queries = c(queries[is_tquery], unlist(queries[!is_tquery], recursive = FALSE))
 48 |   
 49 |   
 50 |   tokens = as_tokenindex(tokens)
 51 |   if (copy) tokens = data.table::copy(tokens)
 52 |   id_column = paste0(column, '_id')    
 53 |   fill_column = paste0(column, '_fill')
 54 |   
 55 |   if (column %in% colnames(tokens)) {
 56 |     if (is.na(overwrite)) stop(sprintf('The specified column (%s) already exists. Set overwrite argument to TRUE to overwrite the column or FALSE to consider existing annotations as a chain.', column))
 57 |     if (overwrite) {
 58 |       tokens[, (column) := NULL] 
 59 |       if (id_column %in% colnames(tokens)) tokens[, (id_column) := NULL]
 60 |       if (fill_column %in% colnames(tokens)) tokens[, (fill_column) := NULL]
 61 |     } else {
 62 |       if (!fill_column %in% colnames(tokens)) stop(sprintf('fill column (%s) is not available', fill_column))
 63 |       i = if (block_fill) which(!is.na(tokens[,get(fill_column)])) else which(tokens[,get(fill_column)] == 0)
 64 |       block = get_long_ids(block, tokens[i, c('doc_id','sentence','token_id')])
 65 |     }
 66 |   }
 67 |   
 68 |   nodes = apply_queries(tokens, queries, as_chain=TRUE, block=block, fill=fill, verbose=verbose)
 69 | 
 70 |   if (nrow(nodes) == 0) {
 71 |     fill_column = paste0(column, '_fill')
 72 |     if (!column %in% colnames(tokens)) tokens[, (column) := factor()]
 73 |     if (!id_column %in% colnames(tokens)) tokens[, (id_column) := factor()]
 74 |     if (!fill_column %in% colnames(tokens)) tokens[, (fill_column) := double()]
 75 |     return(tokens[])
 76 |   }
 77 |   tokens = annotate_nodes(tokens, nodes, column=column)
 78 |   tokens[]
 79 | }
 80 | 
 81 | #' Annotate a tokenlist based on rsyntaxNodes
 82 | #' 
 83 | #' Use rsyntaxNodes, as created with \link{tquery} and \link{apply_queries}, to annotate a tokenlist.
 84 | #' Three columns will be added: a unique id for the query match, the labels assigned in the tquery, and a column with the fill level (0 is direct match, 1 is child of match, 2 is grandchild, etc.).
 85 | #' 
 86 | #' Note that you can also directly use \link{annotate}.
 87 | #' 
 88 | #' @param tokens  A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.
 89 | #' @param nodes   An rsyntaxNodes A data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables.
 90 | #' @param column  The name of the column in which the annotations are added. The unique ids are added as [column]_id, and the fill values are added as [column]_fill.
 91 | #'
 92 | #' @export
 93 | #' @return The tokenIndex data.table with the annotation columns added
 94 | #' 
 95 | #' @examples 
 96 | #' ## spacy tokens for: Mary loves John, and Mary was loved by John
 97 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
 98 | #' 
 99 | #' ## two simple example tqueries
100 | #' passive = tquery(pos = "VERB*", label = "predicate",
101 | #'                  children(relation = c("agent"), label = "subject"))
102 | #' active =  tquery(pos = "VERB*", label = "predicate",
103 | #'                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
104 | #'
105 | #' nodes = apply_queries(tokens, pas=passive, act=active)
106 | #' annotate_nodes(tokens, nodes, 'clause')
107 | annotate_nodes <- function(tokens, nodes, column) {
108 |   if (rsyntax_threads() != data.table::getDTthreads()) {
109 |     old_threads = data.table::getDTthreads()
110 |     on.exit(data.table::setDTthreads(old_threads))
111 |     data.table::setDTthreads(rsyntax_threads())
112 |   }
113 |   
114 |   .FILL_LEVEL = NULL
115 |   tokens = as_tokenindex(tokens)
116 |   if (nrow(nodes) == 0) stop('Cannot annotate nodes, because no nodes are provided')
117 |   if (ncol(nodes) <= 3) stop('Cannot annotate nodes, because no nodes are specified (using the label parameter in find_nodes() or tquery())')
118 |   id_column = paste0(column, '_id')
119 |   fill_column = paste0(column, '_fill')
120 | 
121 |   if (!column %in% colnames(tokens)) tokens[, (column) := factor()]
122 |   if (!id_column %in% colnames(tokens)) tokens[, (id_column) := factor()]
123 |   if (!fill_column %in% colnames(tokens)) tokens[, (fill_column) := double()]
124 |   
125 |   if (nrow(nodes) == 0) {
126 |     return(tokens)
127 |   }
128 |   
129 |   .NODES = prepare_nodes(tokens, nodes) 
130 |   i = tokens[.NODES, on=c('doc_id','sentence','token_id'), which=TRUE]
131 |   
132 |   do_replace = .NODES[i, .FILL_LEVEL] < tokens[i, get(fill_column)]
133 |   replace_row = which(do_replace | is.na(do_replace))
134 |   i = i[replace_row]
135 | 
136 |   tokens[i, (column) := .NODES$.ROLE]
137 |   tokens[i, (id_column) := .NODES$.ID]
138 |   tokens[i, (fill_column) := .NODES$.FILL_LEVEL]
139 |   
140 |   as_tokenindex(tokens)
141 | }
142 | 
143 | 
144 | 
145 | 
146 | #' Transform the nodes to long format and match with token data
147 | #'
148 | #' @param tokens     A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}.
149 | #' @param nodes      A data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables.
150 | #' @param use        Optionally, specify which columns from nodes to add. Other than convenient, this is slighly different 
151 | #'                   from subsetting the columns in 'nodes' beforehand if fill is TRUE. When the children are collected,
152 | #'                   the ids from the not-used columns are still blocked (see 'block')
153 | #' @param token_cols A character vector, specifying which columns from tokens to include in the output
154 | #'
155 | #' @return A data.table with the nodes in long format, and the specified token_cols attached 
156 | #' @export
157 | #' @examples 
158 | #' ## spacy tokens for: Mary loves John, and Mary was loved by John
159 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text3',]
160 | #' 
161 | #' ## two simple example tqueries
162 | #' passive = tquery(pos = "VERB*", label = "predicate",
163 | #'                  children(relation = c("agent"), label = "subject"))
164 | #' active =  tquery(pos = "VERB*", label = "predicate",
165 | #'                  children(relation = c("nsubj", "nsubjpass"), label = "subject"))
166 | #'
167 | #' nodes = apply_queries(tokens, pas=passive, act=active)
168 | #' get_nodes(tokens, nodes)
169 | get_nodes <- function(tokens, nodes, use=NULL, token_cols=c('token')) {
170 |   if (rsyntax_threads() != data.table::getDTthreads()) {
171 |     old_threads = data.table::getDTthreads()
172 |     on.exit(data.table::setDTthreads(old_threads))
173 |     data.table::setDTthreads(rsyntax_threads())
174 |   }
175 |   
176 |   tokens = as_tokenindex(tokens)
177 |   
178 |   missing_col = setdiff(token_cols, colnames(tokens))
179 |   if (length(missing_col) > 0) stop(sprintf('columns specified in token_cols arguments not found: %s', paste(missing_col, collapse=', ')))
180 |   
181 |   .NODES = prepare_nodes(tokens, nodes) 
182 |   
183 |   out = merge(.NODES, tokens, by=c('doc_id','sentence','token_id'))
184 |   subset(out, select = c('doc_id','sentence','token_id','.ID','.ROLE', token_cols))
185 | }
186 | 
187 | 
188 | prepare_nodes <- function(tokens, nodes, use=NULL) {
189 |   .ROLE = NULL
190 |   .NODES = data.table::copy(unique(nodes, by = c('doc_id','sentence','token_id')))
191 |   
192 |   data.table::setkeyv(.NODES, c('doc_id','sentence','token_id'))
193 |   if (!is.null(use)) .NODES = subset(.NODES, .ROLE %in% use)
194 |   .NODES
195 | }
196 | 
197 | 
198 | rm_duplicates <- function(nodes) {
199 |   dup = duplicated(nodes, by = c('doc_id','sentence','token_id'))
200 |   dup_id = unique(nodes$.ID[dup])
201 |   subset(nodes, !nodes$.ID %in% dup_id)
202 | }
203 | 


--------------------------------------------------------------------------------
/tests/testthat/test_alpino.r:
--------------------------------------------------------------------------------
  1 | context("Clauses Alpino")
  2 | 
  3 | DUTCH_SAY_VERBS = c("accepteren", "antwoorden", "beamen", "bedenken", "bedoelen", "begrijpen", "bekenen", 
  4 |                     "beklemtonen", "bekrachtigen", "belijden", "beluisteren", "benadruken", "berekenen", "berichten", "beschouwen", "beschrijven", "beseffen", "betuigen", "bevestigen", "bevroeden", 
  5 |                     "beweren", "bewijzen", "bezweren", "biechten", "aan_bieden", "brengen", "brullen", "concluderen", "confirmeren", "constateren", "debiteren", "declareren", "demonstreren", "denken", "uit_dragen", 
  6 |                     "emailen", "erkenen", "expliceren", "expliciteren", "fantaseren", "formuleren", "aan_geven", "geloven", "horen", "hameren", "herinneren", "vol_houden", "aan_kondigen", "kwetteren", 
  7 |                     "toe_lichten", "bekend_maken", "hard_maken", "melden", "merken", "op_merken", "motiveren", "noemen", "nuanceren", "observeren", "onderschrijven", "onderstrepen", "onthullen", "ontsluieren", 
  8 |                     "ontvallen", "ontvouwen", "oordelen", "parafraseren", "postuleren", "preciseren", "presumeren", "pretenderen", "publiceren", "rapporteren", "realiseren", "redeneren", "refereren", 
  9 |                     "rekenen", "roepen", "aan_roeren", "ruiken", "schaten", "schetsen", "schilderen", "schreeuwen", "schrijven", "signaleeren", "snappen", "snateren", "specificeren", "uit_spreken", "staven", "stellen",
 10 |                     "vast_stellen","aan_stippen", "suggereren", "tateren", "aan_tekenen", "aan_tonen", "twitteren", "verbazen", "verhalen", "verklaren", "verklappen", "verkondigen", "vermoeden", "veronderstellen", "verraden", "vertellen", "na_vertellen", 
 11 |                     "verwachten", "verwittigen", "verwonderen", "verzekeren", "vinden", "voelen", "aan_voelen", "waarschuwen", "wedden", "weten", "aan_wijzen", "winden", "zeggen", "uiteen_zetten", "zien")
 12 | 
 13 | alpino_quote_queries <- function(verbs=DUTCH_SAY_VERBS, exclude_verbs=NULL) {
 14 |   # x zegt dat y
 15 |   zegtdat = tquery(label='verb', lemma = verbs,  
 16 |                    children(label = 'source', relation=c('su')),
 17 |                    children(relation='vc', POS = c('C', 'comp'),
 18 |                             children(label= 'quote', relation=c('body'))),
 19 |                    not_parents(lemma=c('kun','moet','zal')))   ## exclude "kun/moet/zal je zeggen dat ..."   
 20 |   
 21 |   # x stelt: y
 22 |   ystelt = tquery(lemma = verbs, 
 23 |                   children(label = 'source', relation=c('su')),
 24 |                   children(label = 'quote', relation='nucl'),
 25 |                   children(lemma =  quote_punctuation))
 26 |   
 27 |   # y, stelt x
 28 |   xstelt = tquery(label='quote', 
 29 |                   custom_fill(NOT(relation='tag')),
 30 |                   children(label='verb', relation='tag', lemma = verbs,
 31 |                            children(label = 'source', relation=c('su'))))
 32 |   
 33 |   # y, volgens x
 34 |   volgens = tquery(label='quote',
 35 |                    children(label='verb', relation=c('mod','tag'), lemma = c('volgens','aldus'),
 36 |                             children(label='source')))
 37 |   
 38 |   # y, zo noemt x het
 39 |   noemt = tquery(label='verb', relation='tag',
 40 |                  children(label='source', relation=c('su')),
 41 |                  parents(label='quote',
 42 |                          children(relation = ' --', lemma = quote_punctuation)))
 43 |   
 44 |   # x is het er ook mee eens: y
 45 |   impliciet = tquery(label='verb',
 46 |                      children(lemma = c('"', "'")),
 47 |                      children(label='quote', relation=c('tag','nucl','sat')),
 48 |                      children(label='source', relation=c('su')))
 49 |   
 50 |   # x: y
 51 |   impliciet2 = tquery(label='source',
 52 |                       children(lemma = ':'),
 53 |                       children(label='quote', relation=c('tag','nucl','sat')),
 54 |                       not_children(relation='su'))
 55 |   
 56 |   # moet/kan/zal zeggen mag wel als eerste persoon is
 57 |   moetzeggen = tquery(label='verb', lemma=c('kunnen','moeten','zullen'), 
 58 |                       children(lemma=verbs,
 59 |                                children(label = 'source', lemma=c('ik','wij'), relation=c('su')), 
 60 |                                children(relation='vc', POS = c('C', 'comp'),
 61 |                                         children(label= 'quote', relation=c('body')))))
 62 |   
 63 |   
 64 |   ## order matters
 65 |   list(zegtdat=zegtdat, ystelt=ystelt, xstelt=xstelt, volgens=volgens, noemt=noemt, 
 66 |        impliciet=impliciet, impliciet2=impliciet2, moetzeggen=moetzeggen)
 67 | }
 68 | 
 69 | alpino_clause_queries <- function(verbs=NULL, exclude_verbs=DUTCH_SAY_VERBS) {
 70 |   
 71 |   passive = tquery(POS = 'verb', NOT(lemma = exclude_verbs), label='predicate',
 72 |                    parents(lemma = c('zijn','worden','hebben')),
 73 |                    children(lemma = c('door','vanwege','omwille'), 
 74 |                             children(label='subject', relation='obj1')))
 75 |   
 76 |   ## [subject] [has/is/etc.] [verb] [object]
 77 |   perfect = tquery(POS = 'verb', NOT(lemma = exclude_verbs), 
 78 |                    parents(label='predicate', lemma = c('zijn','worden','hebben')),
 79 |                    children(label='subject', relation=c('su')))
 80 |   
 81 |   ## [subject] [verb] [object]
 82 |   active = tquery(label='predicate', POS = 'verb', NOT(relation = 'vc', lemma = exclude_verbs),
 83 |                   children(label='subject', relation=c('su')))
 84 |   
 85 |   ## [subject] [verb] 
 86 |   catch_rest = tquery(label='predicate', POS = 'verb', NOT(lemma = exclude_verbs),
 87 |                       children(label='subject', relation=c('su')))
 88 |   
 89 |   list(passive=passive, perfect=perfect, active=active, catch_rest=catch_rest)
 90 | }
 91 | 
 92 | 
 93 | get_quotes <- function(tokens, block=NULL) {
 94 |   queries = alpino_quote_queries()
 95 |   apply_queries(tokens, queries, as_chain=TRUE, block = block, check = FALSE)
 96 | }
 97 | 
 98 | get_clauses <- function(tokens, block=NULL){
 99 |   queries = alpino_clause_queries()
100 |   apply_queries(tokens, queries, as_chain=TRUE, block = block, check = FALSE)
101 | }
102 | 
103 | .check <- function(tokens, nodes, ...) {
104 |   check = list(...)
105 |   for(name in names(check)) {
106 |     expected = as.character(check[[name]])
107 |     actual = get_nodes(tokens, nodes, token_cols = 'token')
108 |     #at(name, ': ', as.character(actual$token[actual$.ROLE == name]), '\n')
109 |     actual = as.character(actual$token[actual$.ROLE == name])
110 |     expect_equal(expected, actual)
111 |   }
112 | }
113 | 
114 | test_that("extracting sources works", {
115 |   tokens = as_tokenindex(tokens_dutchquotes)
116 |   library(testthat)
117 |   #plot_tree(tokens, sentence_i=7, token, allign_text = TRUE)
118 |   
119 |   # 1 : Rutte stelt : " Een stem is verloren " . 
120 |   quotes = get_quotes(tokens[tokens$sentence == 1,])
121 |   expect_equal(nrow(quotes), 5)
122 |   .check(tokens, quotes, source="Rutte", quote=c('Een','stem','is','verloren'))
123 |   
124 |   # 2 : Vooruitblikkend naar de Tweede Kamerverkiezingen van 12 september stelde Rutte : " Een stem op de PVV , is een verloren stem " . 
125 |   quotes = get_quotes(tokens[tokens$sentence == 2,])
126 |   expect_equal(nrow(quotes), 10)
127 |   .check(tokens, quotes, source="Rutte", quote=c('Een','stem','op','de','PVV','is','een','verloren','stem'))
128 |   
129 |   # 3 : " Verkiezingsblabla " , zegt PvdA-Kamerlid Kuiken . 
130 |   quotes = get_quotes(tokens[tokens$sentence == 3,])
131 |   #find_nodes2(tokens[tokens$sentence == 3,], alpino_quote_queries()[[3]])
132 |   #find_nodes(tokens[tokens$sentence == 3,], alpino_quote_queries()[[3]])
133 |   
134 |   expect_equal(nrow(quotes), 8)
135 |   .check(tokens, quotes, source=c("PvdA-Kamerlid","Kuiken"), verb='zegt', quote=c('"','Verkiezingsblabla','"',',','.'))
136 |   
137 |   # 4 : Minister Spies zei dat de PvdA een " Kapitale blunder " had begaan . 
138 |   quotes = get_quotes(tokens[tokens$sentence == 4,])
139 |   expect_equal(nrow(quotes), 14)
140 |   .check(tokens, quotes, source=c("Minister","Spies"), verb=c('zei','dat','"','"','.'), quote=c('de','PvdA','een','Kapitale','blunder','had','begaan'))
141 |   
142 |   # 5 : Begrotingstekort is volgens CPB volgend jaar 3.7 procent . 
143 |   quotes = get_quotes(tokens[tokens$sentence == 5,])
144 |   expect_equal(nrow(quotes), 9)
145 |   .check(tokens, quotes, source="CPB", quote=c('Begrotingstekort','is','volgend','jaar','3.7','procent','.'))
146 |   
147 |   # 6 : Hij veroordeelde het avontuur : " Alsof Nederland een politiek laboratorium is " . 
148 |   quotes = get_quotes(tokens[tokens$sentence == 6,])
149 |   expect_equal(nrow(quotes), 14)
150 |   .check(tokens, quotes, source="Hij", verb=c('veroordeelde','het','avontuur',':','"','"','.'), quote=c('Alsof','Nederland','een','politiek','laboratorium','is'))
151 |   
152 |   # 7 : VVD : Doe recht aan alle werkenden , betaald of onbetaald . 
153 |   quotes = get_quotes(tokens[tokens$sentence == 7,])
154 |   expect_equal(nrow(quotes), 12)
155 |   .check(tokens, quotes, source=c("VVD",':',',','.'), quote=c('Doe','recht','aan','alle','werkenden','betaald','of','onbetaald'))
156 | })
157 | 
158 | 
159 | 
160 | test_that("extracting clauses works", {
161 |   tokens = as_tokenindex(tokens_dutchclauses)
162 | 
163 |   tq = tquery(label='target', 
164 |               children(relation = 'cnj', label='conj'))
165 | 
166 |   find_nodes(tokens[tokens$sentence == 1,], 
167 |              tquery(POS='verb', label='pred',
168 |                    children(relation='su', label='subject',
169 |                             children(relation='cnj', label='conj'))))
170 |   
171 |   clauses = get_clauses(tokens)
172 |   
173 |   # subject and subject -> verb -> object
174 |   clauses = get_clauses(tokens[tokens$sentence == 1,])
175 |   annotate_nodes(tokens[tokens$sentence == 1,], clauses, column='test')
176 |   expect_equal(nrow(clauses), 6)
177 |   .check(tokens, clauses, subject=c('Jantje','en','Piet'), predicate = c('hebben','ruzie','.'))
178 |   
179 |   # subject -> verb -> object (with added nonsense)
180 |   clauses = get_clauses(tokens[tokens$sentence == 2,])
181 |   expect_equal(nrow(clauses), 10)
182 |   .check(tokens, clauses, subject='Jantje', predicate = c('slaat','Piet','op','zijn','hoofd','met','een','hamer','.'))
183 |   
184 |   # passive: subject <- is/becomes verb <- object
185 |   clauses = get_clauses(tokens[tokens$sentence == 3,])
186 |   expect_equal(nrow(clauses), 4)
187 |   .check(tokens, clauses, predicate = c('Piet','geslagen','door'), subject='Jantje')
188 | 
189 |   # passive: object <- is/becomes verb <- subject (complex subject; should this be filtered on nouns?)
190 |   clauses = get_clauses(tokens[tokens$sentence == 4,])
191 |   expect_equal(nrow(clauses), 7)
192 |   .check(tokens, clauses, predicate = c('Piet','geschrokken','door'), subject=c('de',"aanval",'van','Jantje'))
193 |   
194 |   # quote and clause: object says: subject -> verb -> object
195 |   clauses = get_clauses(tokens[tokens$sentence == 5,])
196 |   expect_equal(nrow(clauses), 6)
197 |   .check(tokens, clauses, predicate = c('hem','heel','hard','geslagen','heeft'), subject="Jantje")
198 |   
199 |   # quote, clause and negation: subject says: subject -> dit not verb -> object
200 |   quotes = get_quotes(tokens[tokens$sentence == 6,])
201 |   clauses = get_clauses(tokens[tokens$sentence == 6,])
202 |   expect_equal(nrow(clauses), 6)
203 |   .check(tokens, clauses, subject='ik', predicate = c('heb','Piet','niet','zomaar','geslagen'))
204 |   
205 |   #tokens_dutchclauses %>%
206 |   #  as_tokenindex() %>%
207 |   #  annotate_tqueries('clauses', alpino_clause_queries()) %>%
208 |   #  syntax_reader(annotation='clauses', value='subject')
209 |   
210 | })
211 | 


--------------------------------------------------------------------------------
/R/applied_reshape.r:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Split conjunctions for dependency trees in Universal Dependencies 
  4 | #'
  5 | #' @param tokens     a tokenIndex based on texts parsed with \code{\link[spacyr]{spacy_parse}} (with dependency=TRUE)
  6 | #' @param conj_rel   The dependency relation for conjunctions. By default conj 
  7 | #' @param cc_rel     The dependency relation for the coordinating conjunction. By default cc. This will be removed.
  8 | #' @param unpack      If TRUE (default), create separate branches for the parent and the node that inherits the parent position
  9 | #' @param no_fill     Optionally, a character vector with relation types that will be excluded from fill
 10 | #' @param min_dist    Optionally, a minimal distance between the conj node and its parent
 11 | #' @param max_dist    Optionally, a maximum distance between the conj node and its parent
 12 | #' @param right_fill_dist Should fill to the right of the conjunction be used?
 13 | #' @param compound_rel The relation types indicating compounds
 14 | #' @param ...        specify conditions for the conjunction token. For instance, using 'pos = "VERB"' to only split VERB conjunctions.
 15 | #'                   This is especially usefull to use different no_fill conditions.
 16 | #'
 17 | #' @return A tokenindex
 18 | #' @export
 19 | #'
 20 | #' @examples
 21 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text5',]
 22 | #' 
 23 | #' if (interactive()) {
 24 | #' tokens %>%
 25 | #'    split_UD_conj() %>%
 26 | #'    plot_tree()
 27 | #' }
 28 | split_UD_conj <- function(tokens, conj_rel='conj', cc_rel=c('cc','cc:preconj'), unpack=T, no_fill=NULL, min_dist=0, max_dist=Inf, right_fill_dist=T, compound_rel = c('compound*','flat'), ...) {
 29 |   conj_max_window = if(right_fill_dist) Inf else 0
 30 |   tq = tquery(label='target', NOT(relation = conj_rel),
 31 |               children(relation = compound_rel, label='ignore', req=FALSE),
 32 |               fill(NOT(relation = no_fill), max_window = c(Inf,0), connected=TRUE),
 33 |               children(relation = conj_rel, label='origin', ..., min_window=c(min_dist,min_dist), max_window = c(max_dist,max_dist),
 34 |                        fill(NOT(relation = no_fill), max_window=c(0,conj_max_window), connected=TRUE)))
 35 |   tokens = climb_tree(tokens, unpack=unpack, tq)
 36 |   if (!is.null(cc_rel)) tokens = chop(tokens, relation = cc_rel)
 37 |   tokens
 38 | }
 39 | 
 40 | #' Have a node adopt its parent's position
 41 | #' 
 42 | #' given a tquery that identfies a node labeled "origin", that has a parent labeled "target", 
 43 | #' recursively have child adopt the parent's position (parent and relation column)
 44 | #' and adopt parents fill nodes. only_new restricts adding fill nodes to relations that child
 45 | #' does not already have. This seems to be a good heuristic for dealing with argument drop
 46 | #'
 47 | #' @param .tokens     A tokenIndex
 48 | #' @param tq          A tquery. Needs to have a node labeled "origin" that has a parent labeled "target" 
 49 | #' @param unpack      If TRUE (default), create separate branches for the parent and the node that inherits the parent position
 50 | #' @param isolate     If unpack is TRUE and isolate is TRUE (default is FALSE), isolate the new branch by recursively unpacking 
 51 | #' @param take_fill   If TRUE (default), give the node that will inherit the parent position a copy of the parent children (but only if it does not already have children with this relation; see only_new)
 52 | #' @param give_fill   If TRUE (default), copy the children of the node that will inherit the parent position to the parent (but only if it does not already have children with this relation; see only_new)
 53 | #' @param only_new    A characetr vector giving one or multiple column names that need to be unique for take_fill and give_fill
 54 | #' @param max_iter    The climb tree function repeatedly resolves the first conjunction it encounters in a sentence. This can lead to many iterations
 55 | #'                    for sentences with many (nested) conjunctions. It could be the case that in unforseen cases or with certain parsers
 56 | #'                    an infinite loop is reached, which is why we use a max_iter argument that breaks the loop and sends a warning if the max is reached.
 57 | #'
 58 | #' @return  The reshaped tokenIndex 
 59 | #' @export
 60 | #' @examples 
 61 | #' 
 62 | #' spacy_conjunctions <- function(tokens) {
 63 | #'   no_fill = c('compound*','case', 'relcl')
 64 | #'   tq = tquery(label='target', NOT(relation = 'conj'),
 65 | #'               rsyntax::fill(NOT(relation = no_fill), max_window = c(Inf,0)),
 66 | #'               children(relation = 'conj', label='origin',
 67 | #'                        rsyntax::fill(NOT(relation = no_fill), max_window=c(0,Inf))))
 68 | #'   tokens = climb_tree(tokens, tq)
 69 | #'   chop(tokens, relation = 'cc')
 70 | #' }
 71 | #' 
 72 | #' ## spacy tokens for "Bob and John ate bread and drank wine"
 73 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text5',]
 74 | #'
 75 | #' tokens = spacy_conjunctions(tokens)
 76 | #' 
 77 | #' tokens
 78 | #' \donttest{
 79 | #' if (interactive()) plot_tree(tokens)
 80 | #' }
 81 | climb_tree <- function(.tokens, tq, unpack=TRUE, isolate=TRUE, take_fill=TRUE, give_fill=TRUE, only_new='relation', max_iter=200) {
 82 |   if (rsyntax_threads() != data.table::getDTthreads()) {
 83 |     old_threads = data.table::getDTthreads()
 84 |     on.exit(data.table::setDTthreads(old_threads))
 85 |     data.table::setDTthreads(rsyntax_threads())
 86 |   }
 87 |   
 88 |   target = NULL; new_parent = NULL; tree_parent = NULL
 89 |   i = 1
 90 |   out = list()
 91 |   
 92 |   .tokens = select_nodes(.tokens, tq, fill_only_first = FALSE, .one_per_sentence=TRUE)
 93 |   last_nodes = selected_nodes(.tokens)
 94 |   
 95 |   if (nrow(last_nodes$nodes) == 0) return(.tokens)
 96 | 
 97 |   ## split selected and unselected sentences and store unselected for output
 98 |   filt = unique(last_nodes$nodes[,c('doc_id','sentence')])
 99 |   out[[1]] = .tokens[!filt, on=c('doc_id','sentence')]
100 |   .tokens = .tokens[filt, on=c('doc_id','sentence')]
101 |   data.table::setattr(.tokens, '.nodes', value = last_nodes)  
102 |   
103 |   if (!'tree_parent' %in% colnames(.tokens)) .tokens[, tree_parent := numeric()]
104 |   
105 |   while (TRUE) {
106 |     if (take_fill) .tokens = copy_fill(.tokens, 'target', 'origin', only_new=only_new)
107 |     if (give_fill) .tokens = copy_fill(.tokens, 'origin', 'target', only_new=only_new)
108 |     
109 |     .tokens = mutate_nodes(.tokens, 'origin', parent=target$parent, relation=target$relation, tree_parent=target$tree_parent, tree_relation='conj')
110 |     
111 |     if (unpack) {
112 |       tq2 = tquery(label = 'child', g_id = last_nodes$nodes[,c('doc_id','sentence','origin')],
113 |                    parents(label = 'parent'))
114 |       .tokens = select_nodes(.tokens, tq2, fill_only_first = FALSE) 
115 |       ## copy the parent 
116 |       .tokens = copy_nodes(.tokens, 'parent', 'new_parent', copy_fill = FALSE)
117 |       ## point the duplicate children towards new  copy
118 |       .tokens = mutate_nodes(.tokens, 'child', parent=new_parent$token_id)
119 |       ## and add the parent fill for which relation is not already in copy
120 |       .tokens = copy_fill(.tokens, 'parent', 'new_parent', only_new = 'relation')
121 |       if (isolate) .tokens = resolve_siblings(.tokens)
122 |     }
123 |     
124 |     .tokens = select_nodes(.tokens, tq, fill_only_first = FALSE, .one_per_sentence=TRUE) 
125 |     last_nodes = selected_nodes(.tokens)
126 |     
127 |     if (nrow(last_nodes$nodes) == 0) {
128 |       out[[i+1]] = data.table::copy(.tokens)
129 |       break
130 |     }
131 |     
132 |     filt = unique(last_nodes$nodes[,c('doc_id','sentence')])
133 |     out[[i+1]] = .tokens[!filt, on=c('doc_id','sentence')]
134 |     .tokens = .tokens[filt, on=c('doc_id','sentence')]
135 |     data.table::setattr(.tokens, '.nodes', value = last_nodes)  
136 |     
137 |     i = i + 1
138 |     if (i > max_iter) {
139 |       warning(sprintf('Stopped at iteration %s. See max_iter argument', max_iter))
140 |       break ## this just shouldn't be possible
141 |     }
142 |     
143 |   }
144 |   #.tokens
145 |   as_tokenindex(data.table::rbindlist(out, fill = TRUE))
146 | }
147 | 
148 | 
149 | resolve_siblings <- function(tokens, no_fill=NULL) {
150 |   .SIBLING = target_copy = NULL
151 |   
152 |   ftok = data.table::data.table(doc_id=tokens$doc_id, 
153 |                                 sentence=tokens$sentence, 
154 |                                 floor_token_id=floor(tokens$token_id), 
155 |                                 token_id = tokens$token_id,
156 |                                 parent=tokens$parent)
157 |   dupl = duplicated(ftok, by = c('doc_id','sentence','floor_token_id','parent')) &! is.na(ftok$parent)
158 |   if (!any(dupl)) return(tokens)
159 |   
160 |   sib = tokens[dupl, c('doc_id','sentence','token_id')]
161 |   
162 |   dupl_tok = ftok[ftok[dupl,], on=c('doc_id','sentence','floor_token_id')]
163 |   tokens[,.SIBLING := FALSE]
164 |   tokens[dupl_tok, .SIBLING := TRUE, on=c('doc_id','sentence','token_id')]
165 |   
166 |   tq = tquery(label='target',
167 |               fill(NOT(relation=no_fill), .SIBLING=FALSE, connected=TRUE),
168 |               children(label='origin', g_id=sib[,c('doc_id','sentence','token_id')]))
169 |   
170 |   tokens = select_nodes(tokens, tq)
171 |   tokens = copy_nodes(tokens, 'target', new = 'target_copy', copy_fill = TRUE)
172 |   tokens = mutate_nodes(tokens, 'origin', parent = target_copy$token_id)
173 |   
174 |   ## repeat until no siblings remain
175 |   tokens = resolve_siblings(tokens)
176 |   
177 |   ## backup plan
178 |   #tokens = remove_duplicate_adds(tokens)
179 |   #print(tokens)
180 |   
181 |   if ('.SIBLING' %in% colnames(tokens)) tokens[, .SIBLING := NULL]
182 |   tokens
183 | }
184 | 
185 | remove_duplicate_adds <- function(.tokens) {
186 |   dupl = duplicated(data.table::data.table(doc_id=.tokens$doc_id, 
187 |                                            sentence=.tokens$sentence, 
188 |                                            token_id=floor(.tokens$token_id), 
189 |                                            parent=.tokens$parent))
190 |   dupl = dupl &! is.na(.tokens$parent)
191 |   chop(.tokens, g_id = .tokens[dupl,])
192 | }
193 | 
194 | 
195 | one_per_sentence <- function(.tokens) {
196 |   ## cannot unpack multiple pairs within the same branch, so force unique per sentence
197 |   attr(.tokens, '.nodes')$nodes = attr(.tokens, '.nodes')$nodes[!duplicated(attr(.tokens, '.nodes')$nodes[,c('doc_id','sentence')]),]
198 |   attr(.tokens, '.nodes')$fill = attr(.tokens, '.nodes')$fill[attr(.tokens, '.nodes')$fill$.ID %in% attr(.tokens, '.nodes')$nodes$.ID,]
199 |   .tokens
200 | }
201 | 
202 | 
203 | #' Chop of a branch of the tree
204 | #'
205 | #' Using the query language for tquery, chop of the branch down from the node that is found
206 | #'
207 | #' @param .tokens A tokenIndex
208 | #' @param ... Arguments passed to tquery. For instance, relation = 'punct' cuts off all punctuation dependencies (in universal dependencies)
209 | #'
210 | #' @export
211 | #' @return A tokenIndex with the rows of the nodes in the selected branches removed
212 | #' @examples 
213 | #' 
214 | #' spacy_conjunctions <- function(tokens) {
215 | #'   no_fill = c('compound*','case', 'relcl')
216 | #'   tq = tquery(label='target', NOT(relation = 'conj'),
217 | #'               rsyntax::fill(NOT(relation = no_fill), max_window = c(Inf,0)),
218 | #'               children(relation = 'conj', label='origin',
219 | #'                        rsyntax::fill(NOT(relation = no_fill), max_window=c(0,Inf))))
220 | #'   tokens = climb_tree(tokens, tq)
221 | #'   chop(tokens, relation = 'cc')
222 | #' }
223 | #' 
224 | #' ## spacy tokens for "Bob and John ate bread and drank wine"
225 | #' tokens = tokens_spacy[tokens_spacy$doc_id == 'text5',]
226 | #' 
227 | #' tokens = spacy_conjunctions(tokens)
228 | #' tokens
229 | #' \donttest{
230 | #' if (interactive()) plot_tree(tokens)
231 | #' }
232 | chop <- function(.tokens, ...) {
233 |   if (rsyntax_threads() != data.table::getDTthreads()) {
234 |     old_threads = data.table::getDTthreads()
235 |     on.exit(data.table::setDTthreads(old_threads))
236 |     data.table::setDTthreads(rsyntax_threads())
237 |   }
238 |   
239 |   tq = tquery(..., label = 'chop')
240 |   .tokens = select_nodes(.tokens, tq)
241 |   .tokens = remove_nodes(.tokens, 'chop')
242 |   unselect_nodes(.tokens)
243 | }
244 | 
245 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, include = FALSE}
  8 | knitr::opts_chunk$set(
  9 |   collapse = TRUE,
 10 |   comment = "#>",
 11 |   fig.path = "man/figures/README-",
 12 |   out.width = "100%"
 13 | )
 14 | ```
 15 | 
 16 | 
 17 | R functions for working with syntactic structure coded as token lists
 18 | (e.g. CONLL format)
 19 | 
 20 | # Installation
 21 | 
 22 | You can install from CRAN:
 23 | 
 24 | ```{r, eval=FALSE}
 25 | install.packages('rsyntax')
 26 | ```
 27 | 
 28 | Or install the development version from github:
 29 | 
 30 | ```{r, eval=FALSE}
 31 | library(devtools)
 32 | install_github("vanatteveldt/rsyntax")
 33 | ```
 34 | 
 35 | # Tutorial
 36 | 
 37 | For a detailed explanation please see [this working paper](https://github.com/vanatteveldt/rsyntax/blob/master/Querying_dependency_trees.pdf). For a quick and dirty demo, keep on reading.
 38 | 
 39 | ## Preparing the data
 40 | 
 41 | First, we'll need to parse some data. In the working paper we use the spacyr package (for the spaCy parser), but this requires running Python. Another option that does run in native R is the udpipe package (for the UDPipe parser). The following code automatically downloads the English model and applies it to parse the given text.
 42 | 
 43 | ```{r, message=F}
 44 | library(udpipe)
 45 | tokens = udpipe('Mary Jane loves John Smith, and Mary is loved by John', 'english')
 46 | ```
 47 | 
 48 | rsyntax requires the tokens to be in a certain format. The as_tokenindex() function converts a data.frame to this format. For popular parsers in R (spacyr and udpipe) the correct column name specifications are known, so the following is sufficient. 
 49 | 
 50 | ```{r, message=F}
 51 | library(rsyntax)
 52 | tokens = as_tokenindex(tokens)
 53 | ```
 54 | 
 55 | ## Querying the dependency tree
 56 | 
 57 | To query a dependency tree, it is important to have a good understanding of what these trees look like, and how this tree data is represented in a data.frame format. To facilitate this understanding, the plot_tree function visualizes the dependency tree, together with a given selection of columns from the data (see working paper for why this is possible for most types of dependency trees). 
 58 | 
 59 | ```{r, eval=FALSE}
 60 | plot_tree(tokens, token, lemma, upos)
 61 | ```
 62 | 
 63 | ```{r fig1, fig.width=10, fig.height=5, echo=FALSE, message=F}
 64 | plot_tree(tokens, token, lemma, upos, viewer_mode=F)
 65 | ```
 66 | 
 67 | 
 68 | Note that this function only prints one sentence a time, so if the sentence is not specified it uses the first sentence in the data.
 69 | 
 70 | The main functionality of rsyntax is that you can query the dependency tree. While there are several query languages for networks, these are quite complicated and not specialized for querying dependency trees. We therefore developed a new query format that is (supposed to be) easy to understand if you undestand R data.frames. The first step is to create the query using the `tquery` function.
 71 | 
 72 | ### Querying specific nodes
 73 | 
 74 | Firstly, you can provide lookup values for selecting rows from the data.frame. For example, the following query would find all rows where the upos value is either "VERB" or "PROPN": 
 75 | 
 76 | ```{r, eval=FALSE}
 77 | tquery(upos = c("VERB", "PROPN"))
 78 | ```
 79 | 
 80 | ### Querying a pattern of nodes
 81 | 
 82 | To query the edges of a dependency tree, you can perform another row lookup for the parents or children of the results of this query, by nesting the parents() and children() functions. The following query says: for all tokens (i.e. rows) where upos has the value "VERB", find the ones that have a child for which the relation column has the value "nsubj".
 83 | 
 84 | ```{r, eval=FALSE}
 85 | tq = tquery(upos = 'VERB', 
 86 |             children(relation = 'nsubj'))
 87 | ```
 88 | 
 89 | You can look up multiple parents and children, and also nest parents and children within each other to query larger parts of the tree. 
 90 | 
 91 | The above query only finds a match.
 92 | To see which tokens are matched you need to provide labels for the parts of the query that you want to find.
 93 | The following query looks for a simple direct clause with a verb, subject and object.
 94 | 
 95 | ```{r}
 96 | direct = tquery(label = 'verb', upos = 'VERB', 
 97 |                 children(label = 'subject', relation = 'nsubj'),
 98 |                 children(label = 'object', relation = 'obj'))
 99 | ```
100 | 
101 | Specifically this says: find all tokens where upos is "VERB", and that have a child with the relation "nsubj" AND a child with the relation "obj". If this condition is met, give these tokens the labels "verb", "subject" and "object".
102 | 
103 | With the annotate function, we can use this tquery to add these labels to the token data. Here we say that we use the column name "clause" for these labels. 
104 | 
105 | ```{r}
106 | tokens = annotate_tqueries(tokens, 'clause', direct)
107 | 
108 | tokens[,c('doc_id','sentence','token','clause','clause_fill')]
109 | ```
110 | 
111 | In the output we see that "Mary Jane" is labeled as subject, "loves" is labeled as verb, but also that ALL the rest is labeled as object, including ", and Mary is loved by John". 
112 | The reason for this is that by default, rsyntax will label all children of a matched token with the same label.
113 | We call this behavior the "fill" heuristic.
114 | In the clause_fill column you also see at what level a token was matched. The value 0 means the match itself, 1 means a direct child, etc.
115 | The default setting to fill all children is weird in this example, but in the next section we show how this behavior can be customized.
116 | 
117 | ### Using the fill heuristic
118 | 
119 | In our example sentence, we could turn off fill (with the `fill = F` argument) so only John is matched as the object, but a better solution would be to control what specific nodes to fill by nesting the `fill()` function.
120 | For example, we can say that for the subject and object we only want to 'fill' the tokens that form a multiword expression (MWE). In Universal Dependencies this is indicated with the 'flat', 'fixed' and 'compound' relations (see the (Universal Dependencies Relations table)[https://universaldependencies.org/u/dep/]).
121 | Here we use the fill function to specify that we only want to fill tokens where the relation has one of these values. 
122 | Note that specifying lookup values in `fill()` works in the same way as in the `children()` function.
123 | 
124 | ```{r}
125 | fill_mwe = fill(relation = c('flat','fixed','compount'), 
126 |                 connected=T)
127 | ```
128 | 
129 | Next to giving the lookup values for the relation column, we specify that `connected = TRUE`. 
130 | This determines how lookup values are applied for longer branches of children (children -> grandchildren -> etc.).
131 | If connected is TRUE, then whenever a token does not satisfy the lookup values, the tquery will stop looking in this branch.
132 | So, in our current example, if the direct child is not a MWE, the grandchild will not be filled even if it is a MWE.
133 | For multiword expressions this makes sense, because if tokens with 'flat', 'fixed' or 'compound' relations are not directly connected, they are part of different multiword expressions.
134 | 
135 | For reference, if connected is FALSE (which is the default), fill will get all the children, grandchildren, etc., and then filter them based on the lookup values.
136 | 
137 | To use the `fill()` function in a tquery, we simply pass it to one (or multiple) of the labeled nodes, similar to how you would pass the `children` function.
138 | Here we use the `fill_mwe` as specified above for both the subject and object nodes.
139 | Also, we set `fill = F` for the 'verb' node, as an example of how to disable fill for a specific node.
140 | 
141 | ```{r}
142 | direct = tquery(label = 'verb', upos = 'VERB', fill=F,
143 |                 children(label = 'subject', relation = 'nsubj', 
144 |                          fill_mwe),
145 |                 children(label = 'object', relation = 'obj', 
146 |                          fill_mwe))
147 | ```
148 | 
149 | Note that it would also have been possible to directly type this fill() function within the tquery, instead of first assigning it to `fill_mwe`. 
150 | This is a matter of preference, but if you have specific fill settings that you want to use multiple times, the above approach is a good strategy to reduce redundancy in your code.
151 | 
152 | In case you didn't believe us, this actually works.
153 | Here we run the annotate_tqueries function again.
154 | Very importantly, note that we add the `overwrite = TRUE` argument, which means that we'll overwrite the previous "clause" column. (By default, annotate would not overwrite previous results, which enables another way of chaining queries that we won't discuss here.)
155 | 
156 | ```{r}
157 | tokens = annotate_tqueries(tokens, 'clause', direct, overwrite = T)
158 | tokens[,c('doc_id','sentence','token','clause','clause_fill')]
159 | ```
160 | 
161 | 
162 | ### Chaining multiple tqueries
163 | 
164 | Our `direct` tquery does not capture "Mary is loved by John", in which the relation is expressed in a passive form. 
165 | More generally speaking, there are different ways in which people express certain semantic relations in language, so to capture all (or at least most) of them you will have to combine multiple tqueries.
166 | How many queries you'll need depends on what you want to do, but in our experience only a few queries are needed to get good performance on tasks such as quote and clause extraction.
167 | 
168 | For our current example, we only need to add an additional query for subject-verb-object relations in a passive sentence.
169 | Here we again only use a simple version where the subject and obj are explicitly specified. 
170 | Note that we also re-use `fill_mwe` as specified above.
171 | 
172 | ```{r}
173 | passive = tquery(label = 'verb', upos = 'VERB', fill=FALSE,
174 |                  children(label = 'subject', relation = 'obl', fill_mwe),
175 |                  children(label = 'object', relation = 'nsubj:pass', fill_mwe))
176 | ```
177 | 
178 | Now we can add both tqueries to the annotate function. For convenience, we can also specify labels for the queries by passing them as named arguments. Here we label the direct query "dir" and the passive query "pas". Also, note that we again use overwrite = TRUE. 
179 | 
180 | ```{r}
181 | tokens = annotate_tqueries(tokens, 'clause', 
182 |                            dir = direct, 
183 |                            pas = passive, 
184 |                            overwrite = TRUE)
185 | 
186 | tokens[,c('doc_id','sentence','token','clause', 'clause_id')]
187 | ```
188 | 
189 | This time, the sentence has two annotations.
190 | In the clause_id column you can also see that the first one was found with the direct (dir) tquery, and the second one with the passive (pas) tquery.
191 | 
192 | This can also be visualized with the `plot_tree` function.
193 | 
194 | ```{r, eval=FALSE}
195 | plot_tree(tokens, token, lemma, upos, annotation='clause')
196 | ```
197 | 
198 | ```{r, echo=FALSE, message=F, fig.width=10, fig.height=5}
199 | plot_tree(tokens, token, lemma, upos, annotation='clause', viewer_mode=F)
200 | ```
201 | 
202 | 
203 | 
204 | ### Using chaining in a smart way
205 | 
206 | In the current example, there are no nodes that match both queries, but this will often be the case. 
207 | One of the most important features of rsyntax (compared to using more general purpose graph querying languages) is that the 'chaining' of queries is specialised for the task of annotating tokens.
208 | 
209 | When multiple tqueries are passed to `annotate_tqueries`, each token can only be matched once. 
210 | In case multiple queries match the same token, the following rules are applied to determine which query wins.
211 | 
212 | * Queries earlier in the chain have priority.
213 | * Direct matches have priority over fill. So, even if a query earlier in the chain matched certain tokens, the next queries can still use the fill tokens. 
214 | 
215 | This has two important advantages. 
216 | Firstly, allowing tokens to have only one annotation keeps the data.frame nice and tidy, for a happy Hadley.
217 | Secondly, this enables an easy workflow for improving the precision and recall of your annotations.
218 | 
219 | The general idea is to put specific queries (high precision, low recall) at the front of the chain, and broad queries (high recall, low precision) at the end.
220 | If your recall is low, you can add broad queries to the end of the chain.
221 | If there are cases whether a query incorrectly matches a pattern, you can add queries for this specific pattern to the front to increase the precision. 
222 | 
223 | 
224 | # Where to go from here
225 | 
226 | If the quick and dirty tutorial piqued you interest, we recommend reading the working paper for more advanced features and some background on what we ourselves use this package for.
227 | For instance, the rsyntax package also supports more advanced features for writing and piping queries.
228 | Furthermore, since language can get quite complicated (gotta love concatenations, relative clauses and recursion), rsyntax also provides functions for transforming and cutting up dependency trees. 
229 | How to best use this is still something we're experimenting with.
230 | 
231 | Aside from the rsyntax package we will (soon?) create a github repository for an rsyntax cookbook, to share the queries and transformation that we use in our own research. 
232 | If you are interested in using rsyntax and have any questions, concerns or ideas, please do contact us.
233 | 


--------------------------------------------------------------------------------
/R/recursive_search.r:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # Recursive search tokens
  4 | #
  5 | # tokens   The tokenIndex
  6 | # ids      A data.table with global ids (doc_id,sentence,token_id). 
  7 | # ql       a list of queriers (possibly the list nested in another query, or containing nested queries)
  8 | # block    A data.table with global ids (doc_id,sentence,token_id) for excluding nodes from the search
  9 | # fill     Include or exclude fill nodes
 10 | # block_loop should nodes found within the loop be added to block?
 11 | rec_find <- function(tokens, ids, ql, block=NULL, fill=TRUE, block_loop=T) {
 12 |   .DROP = NULL
 13 |   out_req = list()
 14 |   out_not_req = list()
 15 |   
 16 |   ## make sure that NOT queries are performed last
 17 |   is_NOT = sapply(ql, function(x) x$NOT)
 18 |   ql = c(ql[!is_NOT], ql[is_NOT])
 19 |   
 20 |   for (i in seq_along(ql)) {
 21 |     q = ql[[i]]
 22 |     if (!fill && methods::is(q, 'tQueryFill')) next   
 23 |     
 24 |     if (is.na(q$label)) {
 25 |       q$label = paste('.DROP', i) ## if label is not used, the temporary .DROP name is used to hold the queries during search. .DROP columns are removed when no longer needed
 26 |     } 
 27 |     
 28 |     if (q$NOT) {
 29 |       selection = rec_selection(tokens, ids, q, NULL, fill)
 30 |     } else {
 31 |       selection = rec_selection(tokens, ids, q, block, fill)
 32 |     }
 33 |    
 34 |   
 35 |     if (q$NOT) {
 36 |       if (nrow(selection) > 0) {
 37 |         selection = data.table::fsetdiff(data.table::data.table(ids[,1],ids[,2], .MATCH_ID=ids[[3]]), selection[,c('doc_id','sentence','.MATCH_ID')])
 38 |       } else selection = data.table::data.table(ids[,1], ids[,2], .MATCH_ID=ids[[3]])
 39 |     }
 40 |     
 41 |     if (q$req) {
 42 |       if (nrow(selection) == 0) return(selection)
 43 |       out_req[['']] = selection
 44 |       
 45 |       dropcols = grep('.DROP.*', colnames(selection), value=TRUE)
 46 |       if (length(dropcols) > 0) selection[, (dropcols) := NULL]
 47 |       
 48 |       if (block_loop) 
 49 |         block = get_long_ids(block, selection)
 50 |       
 51 |     } else {
 52 |       out_not_req[['']] = selection
 53 |     }
 54 |   }
 55 |   
 56 |   has_req = length(out_req) > 0  
 57 |   has_not_req = length(out_not_req) > 0  
 58 |   
 59 |   
 60 |   if (has_req && !has_not_req) 
 61 |     out = merge_req(out_req)
 62 |   if (!has_req && has_not_req)
 63 |     out = merge_not_req(ids, out_not_req)
 64 |   if (has_req && has_not_req)
 65 |     out = merge_req_and_not_req(ids, out_req, out_not_req)
 66 |   if (!has_req && !has_not_req)
 67 |     out = data.table::data.table()
 68 |   
 69 |   out
 70 | }
 71 | 
 72 | rec_selection <- function(tokens, ids, q, block, fill) {
 73 |   
 74 |   selection = select_tokens(tokens, ids=ids, q=q, block=block)
 75 |   
 76 |   if (length(q$nested) > 0 & length(selection) > 0) {
 77 |     nested = rec_find(tokens, ids=selection[,c('doc_id','sentence',q$label),with=FALSE], ql=q$nested, block=block, fill=fill) 
 78 |     ## The .MATCH_ID column in 'nested' is used to match nested results to the token_id of the current level (stored under the label column)
 79 |     is_req = any(sapply(q$nested, function(x) x$req))
 80 |     
 81 |     if (nrow(nested) > 0) {
 82 |       if (is_req) {
 83 |         selection = merge(selection, nested, by.x=c('doc_id','sentence',q$label), by.y=c('doc_id','sentence','.MATCH_ID'), allow.cartesian=TRUE) 
 84 |       } else {
 85 |       selection = merge(selection, nested, by.x=c('doc_id','sentence',q$label), by.y=c('doc_id','sentence','.MATCH_ID'), allow.cartesian=TRUE, all.x=TRUE) 
 86 |       }
 87 |     } else {
 88 |       if (is_req) selection = data.table::data.table(.MATCH_ID = numeric(), doc_id=numeric(), sentence=numeric(), .DROP = numeric())
 89 |     }
 90 |   } 
 91 |   data.table::setkeyv(selection, c('doc_id','sentence','.MATCH_ID'))
 92 |   selection
 93 | }
 94 | 
 95 | ## merge a list of results where all results are required: req[[1]] AND req[[2]] AND etc.
 96 | merge_req <- function(req) {
 97 |   out = data.table::data.table()
 98 |   if (any(sapply(req, nrow) == 0)) return(out)
 99 |   for (dt in req) {
100 |     out = if (nrow(out) == 0) dt else merge(out, dt, by=c('doc_id','sentence','.MATCH_ID'), allow.cartesian=TRUE)
101 |   }
102 |   out
103 | }
104 | 
105 | ## merge a list of results where results are not required: req[[1]] OR req[[2]] OR etc.
106 | merge_not_req <- function(ids, not_req) {
107 |   out = data.table::data.table()
108 |   for (dt in not_req) {
109 |     out = if (nrow(out) == 0) dt else merge(out, dt, by=c('doc_id','sentence','.MATCH_ID'), allow.cartesian=TRUE, all=TRUE)
110 |   }
111 |   merge(out, ids, by.x=c('doc_id','sentence','.MATCH_ID'), by.y=colnames(ids), allow.cartesian=TRUE, all.y=TRUE)
112 | }
113 | 
114 | ## merge req and not_req results: (req[[1]] AND req[[2]] AND etc.) AND (not_req[[1]] OR not_req[[2]] OR etc).)
115 | merge_req_and_not_req <- function(ids, req, not_req) {
116 |   
117 |   out = merge_req(req)
118 |   if (nrow(out) > 0) {
119 |     out_not_req = merge_not_req(ids, not_req)
120 |     if (nrow(out_not_req) > 0) {
121 |       out = merge(out, out_not_req, by=c('doc_id','sentence','.MATCH_ID'), allow.cartesian=TRUE, all.x=TRUE)
122 |     }
123 |   }
124 |   out
125 | }
126 | 
127 | # Select which tokens to add
128 | # 
129 | # Given ids, look for their parents/children (specified in query q) and filter on criteria (specified in query q)
130 | #
131 | # tokens   The tokenIndex
132 | # ids      A data.table with global ids (doc_id,sentence,token_id). 
133 | # q        a query (possibly nested in another query, or containing nested queries)
134 | # block    A data.table with global ids (doc_id,sentence,token_id) for excluding nodes from the search
135 | select_tokens <- function(tokens, ids, q, block=NULL) {
136 |   .MATCH_ID = NULL ## bindings for data.table
137 |   selection = select_token_family(tokens, ids, q, block)
138 |   if (!grepl('_FILL', q$label, fixed=TRUE)) {
139 |     selection = subset(selection, select=c('.MATCH_ID', 'doc_id','sentence','token_id'))
140 |     data.table::setnames(selection, 'token_id', q$label)
141 |   } else {
142 |     selection = subset(selection, select=c('.MATCH_ID', 'doc_id','sentence','.FILL_LEVEL','token_id'))
143 |     data.table::setnames(selection, c('token_id','.FILL_LEVEL'), c(q$label, paste0(q$label, '_LEVEL')))
144 |   }
145 |   
146 |   if (nrow(selection) > 0 && !identical(q$max_window, c(Inf,Inf))) {
147 |     dist = selection[[q$label]] - selection$.MATCH_ID
148 |     distfilter = dist >= (-q$max_window[1]) & dist <= q$max_window[2]
149 |     selection = selection[distfilter,]
150 |   }
151 |   if (nrow(selection) > 0 && !identical(q$min_window, c(0,0))) {
152 |     dist = selection[[q$label]] - selection$.MATCH_ID
153 |     distfilter = dist <= (-q$min_window[1]) | dist >= q$min_window[2]
154 |     selection = selection[distfilter,]
155 |   }
156 |   selection
157 | }
158 | 
159 | select_token_family <- function(tokens, ids, q, block) {
160 |   if (q$connected) {
161 |     selection = token_family(tokens, ids=ids, level=q$level, depth=q$depth, block=block, replace=TRUE, show_level = TRUE, lookup=q$lookup, g_id=q$g_id)
162 |   } else {
163 |     selection = token_family(tokens, ids=ids, level=q$level, depth=q$depth, block=block, replace=TRUE, show_level = TRUE, lookup=q$BREAK)
164 |     if (!data.table::haskey(selection)) data.table::setkeyv(selection, c('doc_id','sentence','token_id'))
165 |     selection = filter_tokens(selection, q$lookup, .G_ID = q$g_id)
166 |   }
167 |   selection
168 | }
169 | 
170 | 
171 | # Get the parents or children of a set of ids
172 | #
173 | # tokens   The tokenIndex
174 | # ids      A data.table with global ids (doc_id,sentence,token_id). 
175 | # level    either 'children' or 'parents'
176 | # depth    How deep to search. eg. children -> grandchildren -> grandgrand etc.
177 | # minimal  If TRUE, only return doc_id, sentence, token_id and parent
178 | # block    A data.table with global ids (doc_id,sentence,token_id) for excluding nodes from the search
179 | # replace  If TRUE, re-use nodes in deep_family() 
180 | # show_level If TRUE, add a column showing the level (depth in tree) of a node.
181 | # lookup   filter tokens by lookup values. Will be applied at each level (if depth > 1) 
182 | # g_id     filter tokens by id. See lookup
183 | token_family <- function(tokens, ids, level='children', depth=Inf, minimal=FALSE, block=NULL, replace=FALSE, show_level=FALSE, lookup=NULL, g_id=NULL) {
184 |   .MATCH_ID = NULL
185 |   
186 |   if (!replace) block = get_long_ids(ids, block)
187 |   
188 |   if ('.MATCH_ID' %in% colnames(tokens)) tokens[, .MATCH_ID := NULL]
189 |   
190 |   if (level == 'children') {
191 |     id = tokens[list(ids[[1]], ids[[2]], ids[[3]]), on=c('doc_id','sentence','parent'), nomatch=0, allow.cartesian=TRUE]
192 |     id = filter_tokens(id, .BLOCK=block, lookup=lookup, .G_ID = g_id)
193 |     if (minimal) id = subset(id, select = c('doc_id','sentence','token_id','parent'))
194 |     data.table::set(id, j = '.MATCH_ID', value = id[['parent']])
195 |   }
196 |   if (level == 'parents') {
197 |     .NODE = filter_tokens(tokens, .G_ID = ids)
198 |     .NODE = subset(.NODE, select=c('doc_id','sentence','parent','token_id'))
199 |     
200 |     data.table::setnames(.NODE, old='token_id', new='.MATCH_ID')
201 |     id = filter_tokens(tokens, .G_ID = .NODE[,c('doc_id','sentence','parent')], .BLOCK=block)
202 |     id = filter_tokens(id, .G_ID = g_id, lookup=lookup)
203 |     
204 |     if (minimal) id = subset(id, select = c('doc_id','sentence','token_id','parent'))
205 |     id = merge(id, .NODE, by.x=c('doc_id','sentence','token_id'), by.y=c('doc_id','sentence','parent'), allow.cartesian=TRUE)
206 |   }
207 |   if (depth > 1) id = deep_family(tokens, id, level, depth, minimal=minimal, block=block, replace=replace, show_level=show_level, lookup=lookup, g_id=g_id) 
208 |   if (depth <= 1 && show_level) id = cbind(.FILL_LEVEL=as.double(rep(1,nrow(id))), id)
209 |   id
210 | }
211 | 
212 | # Get the parents or children of a set of ids
213 | #
214 | # tokens   The tokenIndex
215 | # id       rows in the tokenIndex to use as the ID (for whom to get the family)
216 | # level    either 'children' or 'parents'
217 | # depth    How deep to search. eg. children -> grandchildren -> grandgrand etc.
218 | # minimal  If TRUE, only return doc_id, sentence, token_id and parent
219 | # block    A data.table with global ids (doc_id,sentence,token_id) for excluding nodes from the search
220 | # replace  If TRUE, re-use nodes 
221 | # show_level If TRUE, return a column with the level at which the node was found (e.g., as a parent, grantparent, etc.)
222 | # only_new If TRUE, only return new found family. Otherwise, the id input is included as well.
223 | # lookup   Optional lookup filter
224 | # g_id     Optional filter with specific global token ids
225 | deep_family <- function(tokens, id, level, depth, minimal=FALSE, block=NULL, replace=FALSE, show_level=FALSE, only_new=FALSE, lookup=NULL, g_id=NULL) {
226 |   id_list = vector('list', 10) ## 10 is just for reserving (more than sufficient) items. R will automatically add more if needed (don't think this actually requires reallocation).
227 |   id_list[[1]] = id
228 |   i = 2
229 |   
230 |   ilc = data.table()  ## infinite loop catcher. Because some parsers have loops...
231 |   safety_depth = max(tokens$token_id) + 1
232 |   while (i <= depth) {
233 |     if (i == safety_depth) {
234 |       warning(sprintf('Safety depth threshold was reached (max token_id), which probably indicates an infinite loop in deep_family(), which shouldnt happen. Please make a GitHub issue if you see this'))
235 |       break
236 |     }
237 |     .NODE = id_list[[i-1]]
238 |     
239 |     if (nrow(ilc) > 0) {
240 |       .NODE = .NODE[!list(ilc$doc_id, ilc$sentence, ilc$token_id, ilc$.MATCH_ID), on=c('doc_id','sentence','token_id','.MATCH_ID')]
241 |     }
242 |     ilc = rbind(ilc, subset(.NODE, select = c('doc_id','sentence','token_id','.MATCH_ID')))
243 |     
244 |     if (!replace) block = get_long_ids(block, .NODE[,c('doc_id','sentence','token_id'), with=FALSE])
245 |     #print(.NODE)
246 |     #.NODE = subset(.NODE, .NODE$token_id %in% unique(.NODE$parent)) ## prevent infinite loops
247 |     
248 |     if (level == 'children') {
249 |       id = filter_tokens(tokens, .G_PARENT = .NODE[,c('doc_id','sentence','token_id')], .BLOCK=block, lookup=lookup, .G_ID=g_id)
250 |       id = merge(id, subset(.NODE, select = c('doc_id','sentence','token_id','.MATCH_ID')), by.x=c('doc_id','sentence','parent'), by.y=c('doc_id','sentence','token_id'), allow.cartesian=TRUE)
251 |       id_list[[i]] = if (minimal) subset(id, select = c('doc_id','sentence','token_id','parent','.MATCH_ID')) else id
252 |     }
253 |     
254 |     if (level == 'parents') {
255 |       id = filter_tokens(tokens, .G_ID = .NODE[,c('doc_id','sentence','parent')], .BLOCK=block)
256 |       id = filter_tokens(id, .G_ID = g_id, lookup=lookup)
257 |       id = merge(id, subset(.NODE, select = c('doc_id','sentence','parent', '.MATCH_ID')), by.x=c('doc_id','sentence','token_id'), by.y=c('doc_id','sentence','parent'), allow.cartesian=TRUE)
258 |       id_list[[i]] = if (minimal) subset(id, select = c('doc_id','sentence','token_id','parent','.MATCH_ID')) else id
259 |    }
260 |     if (nrow(id_list[[i]]) == 0) break
261 |     i = i + 1
262 |   }
263 |   
264 |   if (only_new) id_list[[1]] = NULL
265 |   if (show_level) {
266 |     id_list = id_list[!sapply(id_list, is.null)]   ## in older version of data.table R breaks (badly) if idcol is used in rbindlist with NULL values in list
267 |     out = data.table::rbindlist(id_list, use.names = TRUE, idcol = '.FILL_LEVEL')
268 |     out$.FILL_LEVEL = as.double(out$.FILL_LEVEL)
269 |     return(out)
270 |   } else {
271 |     return(data.table::rbindlist(id_list, use.names = TRUE))
272 |   }
273 | }
274 | 


--------------------------------------------------------------------------------