├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE.md ├── NAMESPACE ├── R ├── albert-embeddings.R ├── albert-for-token-classification.R ├── annotate.R ├── annotation.R ├── annotation_tool_json_reader.R ├── assertion_dl.R ├── assertion_filterer.R ├── assertion_logreg.R ├── bert-embeddings.R ├── bert-for-token-classification.R ├── bert_sentence_chunk_embeddings.R ├── bert_sentence_embeddings.R ├── chunk-embeddings.R ├── chunk2doc.R ├── chunk2token.R ├── chunk_entity_resolver.R ├── chunk_filterer.R ├── chunker.R ├── classifier_dl.R ├── context-spell-checker.R ├── contextual_parser.R ├── date-matcher.R ├── date_normalizer.R ├── dependencies.R ├── dependency-parser.R ├── distilbert-embeddings.R ├── distilbert-for-token-classification.R ├── doc2chunk.R ├── document-assembler.R ├── document_logreg_classifier.R ├── document_normalizer.R ├── drug_normalizer.R ├── elmo-embeddings.R ├── embeddings_finisher.R ├── entity_ruler.R ├── finisher.R ├── graph-extraction.R ├── graph-finisher.R ├── java.R ├── language-detector-dl.R ├── lemmatizer.R ├── light-pipeline.R ├── longformer-embeddings.R ├── longformer-for-token-classification.R ├── marian_transformer.R ├── medical-ner.R ├── multi_classifier_dl.R ├── multi_date-matcher.R ├── ner-converter-internal.R ├── ner-converter.R ├── ner-crf.R ├── ner-dl.R ├── ner_chunker.R ├── ngram-generator.R ├── normalizer.R ├── norvig-spell-checker.R ├── perceptron.R ├── pretrained-pipeline.R ├── re_ner_chunks_filter.R ├── recursive-pipeline.R ├── recursive-tokenizer.R ├── regex_matcher.R ├── relation_extraction.R ├── relation_extraction_dl.R ├── resource_downloader.R ├── roberta-embeddings.R ├── roberta-for-token-classification.R ├── roberta_sentence_embeddings.R ├── sentence-detector.R ├── sentence-embeddings.R ├── sentence_detector_dl.R ├── sentence_entity_resolver.R ├── sentiment-detector.R ├── sentiment-dl.R ├── stemmer.R ├── stop_words_cleaner.R ├── symmetric-delete.R ├── t5_transformer.R ├── text-matcher.R ├── token-assembler.R ├── tokenizer.R ├── typed-dependency-parser.R ├── univ_sent_encoder.R ├── utils.R ├── vivekn-sentiment-detector.R ├── word-embeddings.R ├── xlm-roberta-embeddings.R ├── xlm_roberta-for-token-classification.R ├── xlm_roberta_sentence_embeddings.R ├── xlnet-embeddings.R ├── xlnet-for-token-classification.R └── yake_model.R ├── README.md ├── configure.R ├── examples ├── annotation │ ├── Pretrained-MatchPattern-Pipeline.Rmd │ ├── entities.txt │ └── extractor.Rmd ├── quick-start.Rmd ├── training │ ├── Classifier_DL_Train_multi_class_news_category_classifier.Rmd │ ├── Classifier_DL_Train_multi_class_news_category_classifier.nb.html │ ├── ViveknNarayanSentimentApproach.Rmd │ ├── ner_dl.Rmd │ └── ner_dl_crf.Rmd └── tutorials │ ├── 5- How to use Spark NLP and Spark ML Pipelines.Rmd │ └── certification_trainings │ ├── 1.SparkNLP_Basics.Rmd │ ├── 2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.Rmd │ ├── 3.SparkNLP_Pretrained_Models.Rmd │ ├── 4.NERDL_Training.Rmd │ ├── 5.1_Text_classification_examples_in_SparkML_SparkNLP.Rmd │ ├── 5.Text_Classification_with_ClassifierDL.Rmd │ ├── 6.Playground_DataFrames.Rmd │ ├── english_models.png │ ├── nerdl-model.png │ ├── pipeline.png │ ├── pre-trained-pipelines.png │ └── stacked_pipeline.png ├── inst ├── java │ ├── sparknlp-2.4-2.11.jar │ ├── sparknlp-3.0-2.12.jar │ └── sparknlp-3.1-2.12.jar └── sparkml │ ├── class_mapping.json │ └── param_mapping.json ├── java └── main.scala ├── man-roxygen ├── roxlate-input-output-params.R ├── roxlate-input-outputs-params.R ├── roxlate-inputs-output-params.R 
├── roxlate-inputs-outputs-params.R ├── roxlate-nlp-algo.R ├── roxlate-nlp-ml-algo.R ├── roxlate-nlp-transformer.R └── roxlate-pretrained-params.R ├── man ├── as_pipeline_model.Rd ├── nlp_albert_embeddings_pretrained.Rd ├── nlp_albert_token_classification_pretrained.Rd ├── nlp_annotate.Rd ├── nlp_annotate_full.Rd ├── nlp_annotation.Rd ├── nlp_annotation_read_dataset.Rd ├── nlp_annotation_tool_json_reader.Rd ├── nlp_assertion_dl.Rd ├── nlp_assertion_dl_pretrained.Rd ├── nlp_assertion_filterer.Rd ├── nlp_assertion_logreg.Rd ├── nlp_assertion_logreg_pretrained.Rd ├── nlp_bert_embeddings_pretrained.Rd ├── nlp_bert_sentence_chunk_embeddings_pretrained.Rd ├── nlp_bert_sentence_embeddings_pretrained.Rd ├── nlp_bert_token_classification_pretrained.Rd ├── nlp_chunk2doc.Rd ├── nlp_chunk2token.Rd ├── nlp_chunk_embeddings.Rd ├── nlp_chunk_entity_resolver.Rd ├── nlp_chunk_entity_resolver_pretrained.Rd ├── nlp_chunk_filterer.Rd ├── nlp_chunker.Rd ├── nlp_classifier_dl.Rd ├── nlp_classifier_dl_pretrained.Rd ├── nlp_conll_read_dataset.Rd ├── nlp_conllu_read_dataset.Rd ├── nlp_context_spell_checker.Rd ├── nlp_context_spell_checker_pretrained.Rd ├── nlp_contextual_parser.Rd ├── nlp_date_matcher.Rd ├── nlp_date_normalizer.Rd ├── nlp_dependency_parser.Rd ├── nlp_dependency_parser_pretrained.Rd ├── nlp_distilbert_embeddings_pretrained.Rd ├── nlp_distilbert_token_classification_pretrained.Rd ├── nlp_doc2chunk.Rd ├── nlp_document_assembler.Rd ├── nlp_document_logreg_classifier.Rd ├── nlp_document_normalizer.Rd ├── nlp_drug_normalizer.Rd ├── nlp_elmo_embeddings_pretrained.Rd ├── nlp_embeddings_finisher.Rd ├── nlp_entity_ruler.Rd ├── nlp_finisher.Rd ├── nlp_generate_assertion_train_set.Rd ├── nlp_generate_colln.Rd ├── nlp_generate_plain_assertion_train_set.Rd ├── nlp_get_classes.Rd ├── nlp_graph_extraction.Rd ├── nlp_graph_finisher.Rd ├── nlp_language_detector_dl_pretrained.Rd ├── nlp_lemmatizer.Rd ├── nlp_lemmatizer_pretrained.Rd ├── nlp_light_pipeline.Rd ├── nlp_longformer_embeddings_pretrained.Rd ├── nlp_longformer_token_classification_pretrained.Rd ├── nlp_marian_transformer.Rd ├── nlp_marian_transformer_pretrained.Rd ├── nlp_medical_ner.Rd ├── nlp_medical_ner_pretrained.Rd ├── nlp_multi_classifier_dl.Rd ├── nlp_multi_classifier_dl_pretrained.Rd ├── nlp_multi_date_matcher.Rd ├── nlp_ner_chunker.Rd ├── nlp_ner_converter.Rd ├── nlp_ner_converter_internal.Rd ├── nlp_ner_crf.Rd ├── nlp_ner_crf_pretrained.Rd ├── nlp_ner_dl.Rd ├── nlp_ner_dl_pretrained.Rd ├── nlp_ngram_generator.Rd ├── nlp_normalizer.Rd ├── nlp_norvig_spell_checker.Rd ├── nlp_norvig_spell_checker_pretrained.Rd ├── nlp_perceptron.Rd ├── nlp_perceptron_pretrained.Rd ├── nlp_pos.Rd ├── nlp_pretrained_pipeline.Rd ├── nlp_pubtator_read_dataset.Rd ├── nlp_re_ner_chunks_filter.Rd ├── nlp_recursive_pipeline.Rd ├── nlp_recursive_tokenizer.Rd ├── nlp_regex_matcher.Rd ├── nlp_relation_extraction.Rd ├── nlp_relation_extraction_dl.Rd ├── nlp_relation_extraction_dl_pretrained.Rd ├── nlp_relation_extraction_pretrained.Rd ├── nlp_resource_downloader.Rd ├── nlp_roberta_embeddings_pretrained.Rd ├── nlp_roberta_sentence_embeddings_pretrained.Rd ├── nlp_roberta_token_classification_pretrained.Rd ├── nlp_sentence_detector.Rd ├── nlp_sentence_detector_dl.Rd ├── nlp_sentence_detector_dl_pretrained.Rd ├── nlp_sentence_embeddings.Rd ├── nlp_sentence_entity_resolver.Rd ├── nlp_sentence_entity_resolver_pretrained.Rd ├── nlp_sentiment_detector.Rd ├── nlp_sentiment_dl.Rd ├── nlp_sentiment_dl_pretrained.Rd ├── nlp_set_input_cols.Rd ├── nlp_set_output_col.Rd ├── nlp_set_param.Rd 
├── nlp_set_param_tuple2.Rd ├── nlp_spark_annotation.Rd ├── nlp_stemmer.Rd ├── nlp_stop_words_cleaner.Rd ├── nlp_symmetric_delete.Rd ├── nlp_symmetric_delete_pretrained.Rd ├── nlp_t5_transformer.Rd ├── nlp_t5_transformer_pretrained.Rd ├── nlp_text_matcher.Rd ├── nlp_token_assembler.Rd ├── nlp_tokenizer.Rd ├── nlp_typed_dependency_parser.Rd ├── nlp_typed_dependency_parser_pretrained.Rd ├── nlp_univ_sent_encoder.Rd ├── nlp_univ_sent_encoder_pretrained.Rd ├── nlp_version.Rd ├── nlp_vivekn_sentiment_detector.Rd ├── nlp_vivekn_sentiment_pretrained.Rd ├── nlp_word_embeddings.Rd ├── nlp_word_embeddings_model.Rd ├── nlp_word_embeddings_pretrained.Rd ├── nlp_xlm_roberta_embeddings_pretrained.Rd ├── nlp_xlm_roberta_sentence_embeddings_pretrained.Rd ├── nlp_xlm_roberta_token_classification_pretrained.Rd ├── nlp_xlnet_embeddings_pretrained.Rd ├── nlp_xlnet_token_classification_pretrained.Rd ├── nlp_yake_model.Rd └── set_nlp_version.Rd ├── sparknlp.Rproj └── tests ├── testthat.R └── testthat ├── .gitignore ├── data ├── .gitignore ├── AskAPatient.fold-0.test.txt ├── corpus_pubtator_sample.txt ├── crf-eng.train.small ├── dependency_treebank │ └── wsj_0001.dp ├── e2e.csv ├── en.test.conllu ├── eng.testa.conll ├── entities.txt ├── entity_ruler │ └── patterns.csv ├── gender.csv ├── gender.json ├── i2b2_assertion_sample.csv ├── lemmas_small.txt ├── pos_corpus.txt ├── random_embeddings_dim4.txt ├── re_train.parquet ├── regex_match.txt ├── result.json ├── sentiment.csv ├── sentiment.parquet │ ├── ._SUCCESS.crc │ ├── .part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet.crc │ ├── _SUCCESS │ └── part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet ├── sentiment_dictionary.txt ├── train.conll2009.txt └── words.txt ├── helper-initialize.R ├── testthat-albert-embeddings.R ├── testthat-albert-for-token-classification.R ├── testthat-annotate.R ├── testthat-annotation_tool_json_reader.R ├── testthat-assertion_dl.R ├── testthat-assertion_filterer.R ├── testthat-assertion_logreg.R ├── testthat-bert-embeddings.R ├── testthat-bert-for-token-classification.R ├── testthat-bert_sentence_chunk_embeddings.R ├── testthat-bert_sentence_embeddings.R ├── testthat-chunk-embeddings.R ├── testthat-chunk2doc.R ├── testthat-chunk2token.R ├── testthat-chunk_entity_resolver.R ├── testthat-chunk_filterer.R ├── testthat-chunker.R ├── testthat-classifier_dl.R ├── testthat-context-spell-checker.R ├── testthat-contextual_parser.R ├── testthat-date-matcher.R ├── testthat-date_normalizer.R ├── testthat-dependency-parser.R ├── testthat-distilbert-embeddings.R ├── testthat-distilbert-for-token-classification.R ├── testthat-doc2chunk.R ├── testthat-document-assembler.R ├── testthat-document_logreg_classifier.R ├── testthat-document_normalizer.R ├── testthat-drug_normalizer.R ├── testthat-elmo-embeddings.R ├── testthat-embeddings_finisher.R ├── testthat-entity_ruler.R ├── testthat-finisher.R ├── testthat-graph-extraction.R ├── testthat-graph-finisher.R ├── testthat-language-detector-dl.R ├── testthat-lemmatizer.R ├── testthat-light-pipeline.R ├── testthat-longformer-embeddings.R ├── testthat-longformer-for-token-classification.R ├── testthat-marian_transformer.R ├── testthat-medical-ner.R ├── testthat-multi-date-matcher.R ├── testthat-multi_classifier_dl.R ├── testthat-ner-converter.R ├── testthat-ner-converter_internal.R ├── testthat-ner-crf.R ├── testthat-ner-dl.R ├── testthat-ner_chunker.R ├── testthat-ngram-generator.R ├── testthat-normalizer.R ├── testthat-norvig-spell-checker.R ├── testthat-perceptron.R ├── 
testthat-pretrained-pipeline.R ├── testthat-pubtator.R ├── testthat-re_ner_chunks_filter.R ├── testthat-recursive-pipeline.R ├── testthat-recursive-tokenizer.R ├── testthat-regex-matcher.R ├── testthat-relation_extraction.R ├── testthat-relation_extraction_dl.R ├── testthat-roberta-embeddings.R ├── testthat-roberta-for-token-classification.R ├── testthat-roberta_sentence_embeddings.R ├── testthat-sentence-detector.R ├── testthat-sentence-embeddings.R ├── testthat-sentence_detector_dl.R ├── testthat-sentence_entity_resolver.R ├── testthat-sentiment-detector.R ├── testthat-sentiment-dl.R ├── testthat-stemmer.R ├── testthat-stop_words_cleaner.R ├── testthat-symmetric-delete.R ├── testthat-t5_transformer.R ├── testthat-text-matcher.R ├── testthat-token-assembler.R ├── testthat-tokenizer.R ├── testthat-typed-dependency-parser.R ├── testthat-univ_sent_encoder.R ├── testthat-utils.R ├── testthat-vivekn-sentiment-detector.R ├── testthat-word-embeddings.R ├── testthat-xlm-roberta-embeddings.R ├── testthat-xlm_roberta-for-token-classification.R ├── testthat-xlm_roberta_sentence_embeddings.R ├── testthat-xlnet-embeddings.R ├── testthat-xlnet-for-token-classification.R ├── testthat-yake_model.R └── tf_graphs ├── RE_in1200D_out20.pb ├── blstm_34_32_30_200_6.pb └── blstm_5_200_128_67.pb /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^internal$ 4 | ^spark-warehouse$ 5 | ^logs$ 6 | ^LICENSE\.md$ 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | internal 6 | logs 7 | spark-warehouse 8 | derby.log 9 | scripts 10 | examples/*.html 11 | *.nb.html 12 | tests/testthat/training_logs/ 13 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sparknlp 2 | Type: Package 3 | Title: R Interface to John Snow Labs Spark NLP 4 | Version: 0.16.0 5 | Authors@R: c( 6 | person("Dave", "Kincaid", email = "dave@kincaidlabs.ai", role = c("aut")), 7 | person("Kevin", "Kuo", email = "kevin.kuo@rstudio.com", role = c("aut", "cre"), 8 | comment = c(ORCID = "0000-0001-7803-7901")), 9 | person(family = "RStudio", role = c("cph")) 10 | ) 11 | Maintainer: Dave Kincaid 12 | Description: This package implements functions to use John Snow Labs Spark NLP with sparklyr. 13 | License: Apache License (>= 2.0) 14 | BugReports: https://github.com/rstudio/sparknlp/issues 15 | Encoding: UTF-8 16 | LazyData: true 17 | Depends: 18 | R (>= 3.1.2), 19 | sparklyr (>= 1.2.0.9000) 20 | Imports: 21 | forge 22 | Roxygen: list(markdown = TRUE) 23 | RoxygenNote: 7.1.2 24 | Suggests: 25 | testthat (>= 3.0.0) 26 | Config/testthat/edition: 3 27 | -------------------------------------------------------------------------------- /R/java.R: -------------------------------------------------------------------------------- 1 | #' @import forge 2 | read_as <- function(sc, value) { 3 | value <- forge::cast_choice(value, c("TEXT", "BINARY", "SPARK")) 4 | invoke_static(sc, "com.johnsnowlabs.nlp.util.io.ReadAs", value) 5 | } 6 | 7 | # As of Spark NLP 2.3.0 these functions are no longer necessary 8 | # # Function to return default argument values for Scala constructors and static methods. 
Use "constructor" for the 9 | # # method name if you want default constructor argument values 10 | # default_argument_static <- function(sc, class_name, method_name, arg_num) { 11 | # module <- invoke_static(sc, paste0(class_name, "$"), "MODULE$") 12 | # 13 | # if (method_name == "constructor") { 14 | # method_name = "apply" 15 | # } 16 | # 17 | # default_name <- paste0(method_name, "$default$", arg_num) 18 | # invoke(module, default_name) 19 | # } 20 | # 21 | # # Function to return default argument values for Scala instance methods 22 | # default_argument <- function(x, method_name, arg_num) { 23 | # default_name <- paste0(method_name, "$default$", arg_num) 24 | # invoke(x, default_name) 25 | # } 26 | -------------------------------------------------------------------------------- /R/light-pipeline.R: -------------------------------------------------------------------------------- 1 | #' Spark NLP Light pipeline 2 | #' 3 | #' LightPipelines are Spark ML pipelines converted into a single machine but multithreaded task, becoming more than 4 | #' 10x times faster for smaller amounts of data (small is relative, but 50k sentences is roughly a good maximum). 5 | #' To use them, simply plug in a trained (fitted) pipeline. 6 | #' 7 | #' @param x a trained (fitted) pipeline 8 | #' @param parse_embeddings whether to parse the embeddings 9 | #' 10 | #' @return a LightPipeline object 11 | #' 12 | #' @export 13 | #' 14 | nlp_light_pipeline <- function(x, parse_embeddings = FALSE) { 15 | UseMethod("nlp_light_pipeline", x) 16 | } 17 | 18 | #' @export 19 | nlp_light_pipeline.nlp_pretrained_pipeline <- function(x, parse_embeddings = FALSE) { 20 | new_nlp_light_pipeline(invoke(spark_jobj(x), "lightModel")) 21 | } 22 | 23 | #' @export 24 | nlp_light_pipeline.ml_pipeline_model <- function(x, parse_embeddings = FALSE) { 25 | sc <- spark_connection(x) 26 | jobj <- invoke_new(sc, "com.johnsnowlabs.nlp.LightPipeline", spark_jobj(x), parse_embeddings) 27 | new_nlp_light_pipeline(jobj) 28 | } 29 | 30 | new_nlp_light_pipeline <- function(jobj) { 31 | structure(list(.jobj = jobj), class = c("nlp_light_pipeline", "ml_pipeline_model", "ml_transformer")) 32 | } 33 | 34 | #' @export 35 | spark_jobj.nlp_light_pipeline <- function(x, ...) { 36 | x$.jobj 37 | } -------------------------------------------------------------------------------- /R/resource_downloader.R: -------------------------------------------------------------------------------- 1 | #' SparkNLP ResourceDownloader functions 2 | #' 3 | #' ResourceDownloader provides functions to easily look for pretrained models & pipelines 4 | #' inside Spark NLP. 
You can filter models or pipelines via language, version, 5 | #' or the name of the annotator 6 | #' 7 | #' @param sc a spark_connect object 8 | #' @param lang language to restrict the results to 9 | #' @param version Spark NLP version to restrict results to 10 | #' 11 | #' @return a markdown table containing the models or pipelines filtered by the provided arguments 12 | #' 13 | #' @name nlp_resource_downloader 14 | #' @aliases ResourceDownloader 15 | NULL 16 | 17 | #' @rdname nlp_resource_downloader 18 | #' @export 19 | nlp_show_public_pipelines <- function(sc, lang = NULL, version = NULL) { 20 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "showPublicPipelines", lang, version) 21 | return(result) 22 | } 23 | 24 | #' @param annotator name of annotator to restrict results 25 | #' @rdname nlp_resource_downloader 26 | #' @export 27 | nlp_show_public_models <- function(sc, annotator = NULL, lang = NULL, version = NULL) { 28 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "showPublicModels", annotator, lang, version) 29 | return(result) 30 | } 31 | 32 | #' @param name name of object to clear 33 | #' @param language language to clear 34 | #' @param remote_loc remote_loc of models to clear 35 | #' @rdname nlp_resource_downloader 36 | #' @export 37 | nlp_clear_cache <- function(sc, name = NULL, language = NULL, remote_loc = NULL) { 38 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "clearCache", name, language, remote_loc) 39 | return(result) 40 | } 41 | 42 | #' @rdname nlp_resource_downloader 43 | #' @export 44 | nlp_show_available_annotators <- function(sc) { 45 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "showAvailableAnnotators") 46 | return(result) 47 | } 48 | -------------------------------------------------------------------------------- /examples/annotation/Pretrained-MatchPattern-Pipeline.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Use pretrained match_pattern Pipeline" 3 | output: html_notebook 4 | --- 5 | 6 | This notebook is adapted from John Snow Labs Jupyter/Python getting started notebook. See 7 | https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/match-pattern-pipeline/Pretrained-MatchPattern-Pipeline.ipynb 8 | for that version. 9 | 10 | * DocumentAssembler 11 | * SentenceDetector 12 | * Tokenizer 13 | * RegexMatcher (match phone numbers) 14 | 15 | ```{r} 16 | library(sparklyr) 17 | library(sparknlp) 18 | library(dplyr) 19 | ``` 20 | 21 | # Let's create a Spark connection for our app 22 | ```{r} 23 | version <- Sys.getenv("SPARK_VERSION", unset = "2.4.0") 24 | 25 | config <- sparklyr::spark_config() 26 | 27 | options(sparklyr.sanitize.column.names.verbose = TRUE) 28 | options(sparklyr.verbose = TRUE) 29 | options(sparklyr.na.omit.verbose = TRUE) 30 | options(sparklyr.na.action.verbose = TRUE) 31 | sc <- sparklyr::spark_connect(master = "local", version = version, config = config) 32 | ``` 33 | 34 | This Pipeline can extract phone numbers in these formats: 35 | 36 | 0689912549
37 | +33698912549
38 | +33 6 79 91 25 49
39 | +33-6-79-91-25-49
40 | (555)-555-5555
41 | 555-555-5555
42 | +1-238 6 79 91 25 49
43 | +1-555-532-3455
44 | +15555323455
45 | +7 06 79 91 25 49 46 | 47 | ```{r} 48 | pipeline <- nlp_pretrained_pipeline(sc, "match_pattern", lang = "en") 49 | ``` 50 | 51 | ```{r} 52 | result <- nlp_annotate(pipeline, "You should call Mr. Jon Doe at +33 1 79 01 22 89") 53 | ``` 54 | 55 | ```{r} 56 | pull(result, regex)[[1]][[1]][[4]] 57 | ``` 58 | ```{r} 59 | result <- nlp_annotate(pipeline, "Ring me up dude! +1-334-179-1466") 60 | ``` 61 | 62 | ```{r} 63 | pull(result, regex)[[1]][[1]][[4]] 64 | ``` 65 | 66 | 67 | -------------------------------------------------------------------------------- /examples/annotation/entities.txt: -------------------------------------------------------------------------------- 1 | i think 2 | Feeling strangely 3 | guitar lessons -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/english_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/english_models.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/nerdl-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/nerdl-model.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/pipeline.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/pre-trained-pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/pre-trained-pipelines.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/stacked_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/stacked_pipeline.png -------------------------------------------------------------------------------- /inst/java/sparknlp-2.4-2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/inst/java/sparknlp-2.4-2.11.jar -------------------------------------------------------------------------------- /inst/java/sparknlp-3.0-2.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/inst/java/sparknlp-3.0-2.12.jar -------------------------------------------------------------------------------- /inst/java/sparknlp-3.1-2.12.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/inst/java/sparknlp-3.1-2.12.jar -------------------------------------------------------------------------------- /man-roxygen/roxlate-input-output-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_col Input column. String. 2 | #' @param output_col Output column. String. 3 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-input-outputs-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_col Input column. String. 2 | #' @param output_cols Output columns. String array. -------------------------------------------------------------------------------- /man-roxygen/roxlate-inputs-output-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_cols Input columns. String array. 2 | #' @param output_col Output column. String. 3 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-inputs-outputs-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_cols Input columns. String array. 2 | #' @param output_cols Output columns. String array. 3 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-nlp-algo.R: -------------------------------------------------------------------------------- 1 | #' @param x A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}. 2 | #' @param uid A character string used to uniquely identify the ML estimator. 3 | #' 4 | #' @return The object returned depends on the class of \code{x}. 5 | #' 6 | #' \itemize{ 7 | #' \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 8 | #' a Spark \code{Estimator} object and can be used to compose 9 | #' \code{Pipeline} objects. 10 | #' 11 | #' \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 12 | #' the NLP estimator appended to the pipeline. 13 | #' 14 | #' \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 15 | #' immediately fit with the input \code{tbl_spark}, returning an NLP model. 16 | #' } 17 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-nlp-ml-algo.R: -------------------------------------------------------------------------------- 1 | #' @param x A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}. 2 | #' @param uid A character string used to uniquely identify the ML estimator. 3 | #' 4 | #' @return The object returned depends on the class of \code{x}. 5 | #' 6 | #' \itemize{ 7 | #' \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 8 | #' a Spark \code{Estimator} object and can be used to compose 9 | #' \code{Pipeline} objects. 10 | #' 11 | #' \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 12 | #' a default pretrained NLP model appended to the pipeline. 
13 | #' 14 | #' \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 15 | #' immediately fit with the input \code{tbl_spark}, returning an NLP model. 16 | #' } 17 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-nlp-transformer.R: -------------------------------------------------------------------------------- 1 | #' @param x A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}. 2 | #' @param uid A character string used to uniquely identify the ML transformer. 3 | #' 4 | #' @return The object returned depends on the class of \code{x}. 5 | #' 6 | #' \itemize{ 7 | #' \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_transformer} object. The object contains a pointer to 8 | #' a Spark \code{Transformer} object and can be used to compose 9 | #' \code{Pipeline} objects. 10 | #' 11 | #' \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 12 | #' the NLP transformer/annotator appended to the pipeline. 13 | #' 14 | #' \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, a transformer is constructed then 15 | #' immediately fit with the input \code{tbl_spark}, returning the transformed data frame. 16 | #' } 17 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-pretrained-params.R: -------------------------------------------------------------------------------- 1 | #' In most cases you can just leave the parameters NULL (except for the Spark connection) and the Spark NLP defaults 2 | #' will be used. 3 | #' 4 | #' @param sc A Spark connection 5 | #' @param name the name of the model to load. If NULL will use the default value 6 | #' @param lang the language of the model to be loaded. If NULL will use the default value 7 | #' @param remote_loc the remote location of the model. If NULL will use the default value 8 | #' 9 | #' @return The Spark NLP model with the pretrained model loaded 10 | -------------------------------------------------------------------------------- /man/as_pipeline_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pretrained-pipeline.R 3 | \name{as_pipeline_model} 4 | \alias{as_pipeline_model} 5 | \title{Get the PipelineModel from a Spark NLP pretrained pipeline} 6 | \usage{ 7 | as_pipeline_model(pipeline) 8 | } 9 | \arguments{ 10 | \item{pretrained_pipeline}{the Spark NLP PretrainedPipeline object} 11 | } 12 | \value{ 13 | the Spark ML pipeline model from the input 14 | } 15 | \description{ 16 | Spark NLP pretrained pipelines are not Spark ML pipeline models. This function 17 | will retrieve the ML pipeline model from the pretrained pipeline object. 
18 | } 19 | -------------------------------------------------------------------------------- /man/nlp_albert_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/albert-embeddings.R 3 | \name{nlp_albert_embeddings_pretrained} 4 | \alias{nlp_albert_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP AlbertEmbeddings model} 6 | \usage{ 7 | nlp_albert_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | lazy_annotator = NULL, 15 | max_sentence_length = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. String.} 28 | 29 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 30 | 31 | \item{batch_size}{batch size} 32 | 33 | \item{dimension}{the embedding dimension} 34 | 35 | \item{lazy_annotator}{use as a lazy annotator or not} 36 | 37 | \item{max_sentence_length}{set the maximum sentence length} 38 | 39 | \item{storage_ref}{storage reference name} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{AlbertEmbeddings} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_albert_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/albert-for-token-classification.R 3 | \name{nlp_albert_token_classification_pretrained} 4 | \alias{nlp_albert_token_classification_pretrained} 5 | \title{Spark NLP AlbertForTokenClassification} 6 | \usage{ 7 | nlp_albert_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 
41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | AlbertForTokenClassification can load ALBERT Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#albertfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_annotate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotate.R 3 | \name{nlp_annotate} 4 | \alias{nlp_annotate} 5 | \title{Annotate some text} 6 | \usage{ 7 | nlp_annotate(x, target, column = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{some SparkNLP object that has an annotate method that takes a Spark data frame as argument} 11 | 12 | \item{target}{the text to annotate. This can be a character string, a character vector or a data frame (with the text 13 | in a field named "text")} 14 | 15 | \item{column}{the column name containing text if a Spark DataFrame is passed in.} 16 | } 17 | \value{ 18 | If given a character vector the return value is a list of lists containing the annotations. 19 | 20 | If given a Spark DataFrame the return value is a Spark data frame containing the annotations 21 | } 22 | \description{ 23 | Use SparkNLP to annotate some text. 24 | } 25 | -------------------------------------------------------------------------------- /man/nlp_annotate_full.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotate.R 3 | \name{nlp_annotate_full} 4 | \alias{nlp_annotate_full} 5 | \title{Fully annotate some text} 6 | \usage{ 7 | nlp_annotate_full(x, target, column = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{some SparkNLP object that has an annotate method that takes a Spark data frame as argument} 11 | 12 | \item{target}{the text to annotate. This can be a character string, a character vector or a data frame (with the text 13 | in a field named "text")} 14 | 15 | \item{column}{the column name containing text if a Spark DataFrame is passed in.} 16 | } 17 | \value{ 18 | If given a character vector the return value is a list of lists containing the annotations. 19 | 20 | If given a Spark DataFrame the return value is a Spark data frame containing the annotations 21 | } 22 | \description{ 23 | Use Spark NLP to fully annotate some text. 
24 | } 25 | -------------------------------------------------------------------------------- /man/nlp_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation.R 3 | \name{nlp_annotation} 4 | \alias{nlp_annotation} 5 | \title{Spark NLP S3 Annotation object} 6 | \usage{ 7 | nlp_annotation(x) 8 | } 9 | \arguments{ 10 | \item{x}{a spark_jobj that is an Annotation object or a named list} 11 | } 12 | \value{ 13 | a local nlp_annotation object 14 | } 15 | \description{ 16 | A Spark NLP annotation S3 object has the following fields: 17 | \itemize{ 18 | \item annotatorType: the type of annotation (String) 19 | \item begin: the index of the first character under this annotation (integer) 20 | \item end: the index after the last character under this annotation (integer) 21 | \item metadata: associated metadata for this annotation (Map(String, String)) 22 | \item result: the main output of the annotation (String) 23 | \item embeddings: vector of embeddings (Array(Float)) 24 | } 25 | } 26 | \details{ 27 | See \url{https://nlp.johnsnowlabs.com/docs/en/concepts#annotation} 28 | } 29 | -------------------------------------------------------------------------------- /man/nlp_annotation_read_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_annotation_read_dataset} 4 | \alias{nlp_annotation_read_dataset} 5 | \title{Create a data frame from an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_annotation_read_dataset(reader, json_path) 8 | } 9 | \arguments{ 10 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 11 | 12 | \item{json_path}{path to the json from annotation lab export} 13 | } 14 | \value{ 15 | assertion train set data frame 16 | } 17 | \description{ 18 | Create a data frame from an AnnotationToolJsonReader 19 | } 20 | -------------------------------------------------------------------------------- /man/nlp_annotation_tool_json_reader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_annotation_tool_json_reader} 4 | \alias{nlp_annotation_tool_json_reader} 5 | \title{Spark NLP AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_annotation_tool_json_reader( 8 | sc, 9 | assertion_labels = list(), 10 | excluded_labels = list(), 11 | cleanup_mode = "disabled", 12 | split_chars = list(), 13 | context_chars = list(), 14 | scheme = "IOB", 15 | min_chars_tol = 2L, 16 | align_chars_tol = 1L, 17 | merge_overlapping = TRUE, 18 | sddl_path = "" 19 | ) 20 | } 21 | \arguments{ 22 | \item{assertion_labels}{list of strings} 23 | 24 | \item{excluded_labels}{list of strings} 25 | 26 | \item{cleanup_mode}{string (Default: disabled)} 27 | 28 | \item{split_chars}{list of strings} 29 | 30 | \item{context_chars}{list of strings} 31 | 32 | \item{scheme}{string (Default: "IOB")} 33 | 34 | \item{min_chars_tol}{integer (Default: 2)} 35 | 36 | \item{align_chars_tol}{integer (Default: 1)} 37 | 38 | \item{merge_overlapping}{boolean (Default: true)} 39 | 40 | \item{sddl_path}{string (Default: "")} 41 | } 42 | \value{ 43 | assertion train set 44 | } 45 | \description{ 46 | The annotation tool json reader is a reader that 
generate a assertion train set from the json from annotations labs exports. 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_assertion_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assertion_dl.R 3 | \name{nlp_assertion_dl_pretrained} 4 | \alias{nlp_assertion_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Assertion DL model} 6 | \usage{ 7 | nlp_assertion_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | scope_window = NULL, 13 | max_sent_len = NULL, 14 | storage_ref = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{batch_size}{Parameter, which regulates the size of the batch} 28 | 29 | \item{scope_window}{The scope window of the assertion (whole sentence by default)} 30 | 31 | \item{max_sent_len}{Parameter, which regulates the length of the longest sentence} 32 | 33 | \item{storage_ref}{storage reference for embeddings} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{AssertionDLModel} model 46 | } 47 | -------------------------------------------------------------------------------- /man/nlp_assertion_filterer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assertion_filterer.R 3 | \name{nlp_assertion_filterer} 4 | \alias{nlp_assertion_filterer} 5 | \title{Spark NLP AssertionFilterer} 6 | \usage{ 7 | nlp_assertion_filterer( 8 | x, 9 | input_cols, 10 | output_col, 11 | criteria = NULL, 12 | whitelist = NULL, 13 | regex = NULL, 14 | uid = random_string("assertion_filterer_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{criteria}{isin or regex} 25 | 26 | \item{whitelist}{If defined, list of entities to process.} 27 | 28 | \item{regex}{If defined, list of entities to process.} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that will allow you to filter out the named entities by 49 | the list of acceptable assertion statuses. This annotator would be quite handy 50 | if you want to set a white list for the acceptable assertion statuses like 51 | present or conditional; and do not want absent conditions get out of your pipeline. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_release_notes#3-assertionfilterer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_assertion_logreg_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assertion_logreg.R 3 | \name{nlp_assertion_logreg_pretrained} 4 | \alias{nlp_assertion_logreg_pretrained} 5 | \title{Load a pretrained Spark NLP Assertion LogReg model} 6 | \usage{ 7 | nlp_assertion_logreg_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | before = NULL, 12 | after = NULL, 13 | start_col = NULL, 14 | end_col = NULL, 15 | lazy_annotator = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. String.} 28 | 29 | \item{before}{Amount of tokens from the context before the target} 30 | 31 | \item{after}{Amount of tokens from the context after the target} 32 | 33 | \item{start_col}{Column that contains the token number for the start of the target} 34 | 35 | \item{end_col}{Column that contains the token number for the end of the target} 36 | 37 | \item{lazy_annotator}{a Param in Annotators that allows them to stand idle in the Pipeline and do nothing. Can be called by other Annotators in a RecursivePipeline} 38 | 39 | \item{storage_ref}{storage reference for embeddings} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{AssertionLogRegModel} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_bert_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bert-embeddings.R 3 | \name{nlp_bert_embeddings_pretrained} 4 | \alias{nlp_bert_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP BertEmbeddings model} 6 | \usage{ 7 | nlp_bert_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | lazy_annotator = NULL, 15 | max_sentence_length = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. 
String.} 28 | 29 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 30 | 31 | \item{batch_size}{batch size} 32 | 33 | \item{dimension}{the embedding dimension} 34 | 35 | \item{lazy_annotator}{use as a lazy annotator or not} 36 | 37 | \item{max_sentence_length}{set the maximum sentence length} 38 | 39 | \item{storage_ref}{storage reference name} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{BertEmbeddings} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_bert_sentence_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bert_sentence_embeddings.R 3 | \name{nlp_bert_sentence_embeddings_pretrained} 4 | \alias{nlp_bert_sentence_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP BertSentenceEmbeddings model} 6 | \usage{ 7 | nlp_bert_sentence_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to lowercase tokens or not} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{defines the output layer of BERT when calculating embeddings} 32 | 33 | \item{max_sentence_length}{max sentence length to process} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{BertSentenceEmbeddings} model. 46 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#bertsentenceembeddings} 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_bert_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bert-for-token-classification.R 3 | \name{nlp_bert_token_classification_pretrained} 4 | \alias{nlp_bert_token_classification_pretrained} 5 | \title{Spark NLP BertForTokenClassification} 6 | \usage{ 7 | nlp_bert_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. 
String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | BertForTokenClassification can load Bert Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#bertfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_chunk2doc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk2doc.R 3 | \name{nlp_chunk2doc} 4 | \alias{nlp_chunk2doc} 5 | \title{Spark NLP Chunk2Doc} 6 | \usage{ 7 | nlp_chunk2doc(x, input_cols, output_col, uid = random_string("chunk2doc_")) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 11 | 12 | \item{input_cols}{Input columns. String array.} 13 | 14 | \item{output_col}{Output column. String.} 15 | 16 | \item{uid}{A character string used to uniquely identify the ML estimator.} 17 | } 18 | \value{ 19 | The object returned depends on the class of \code{x}. 20 | 21 | \itemize{ 22 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 23 | a Spark \code{Estimator} object and can be used to compose 24 | \code{Pipeline} objects. 25 | 26 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 27 | the NLP estimator appended to the pipeline. 28 | 29 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 30 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 31 | } 32 | } 33 | \description{ 34 | Spark ML transformer that Converts a CHUNK type column back into DOCUMENT. Useful when trying to re-tokenize or do 35 | further analysis on a CHUNK result. 
36 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#chunk2doc} 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_chunk2token.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk2token.R 3 | \name{nlp_chunk2token} 4 | \alias{nlp_chunk2token} 5 | \title{Spark NLP Chunk2Token} 6 | \usage{ 7 | nlp_chunk2token(x, input_cols, output_col, uid = random_string("chunk2token_")) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 11 | 12 | \item{input_cols}{Input columns. String array.} 13 | 14 | \item{output_col}{Output column. String.} 15 | 16 | \item{uid}{A character string used to uniquely identify the ML estimator.} 17 | } 18 | \value{ 19 | The object returned depends on the class of \code{x}. 20 | 21 | \itemize{ 22 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 23 | a Spark \code{Estimator} object and can be used to compose 24 | \code{Pipeline} objects. 25 | 26 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 27 | the NLP estimator appended to the pipeline. 28 | 29 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 30 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 31 | } 32 | } 33 | \description{ 34 | Spark ML transformer that 35 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_annotators#chunk2token} 36 | } 37 | -------------------------------------------------------------------------------- /man/nlp_chunk_embeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk-embeddings.R 3 | \name{nlp_chunk_embeddings} 4 | \alias{nlp_chunk_embeddings} 5 | \title{Spark NLP ChunkEmbeddings} 6 | \usage{ 7 | nlp_chunk_embeddings( 8 | x, 9 | input_cols, 10 | output_col, 11 | pooling_strategy = NULL, 12 | uid = random_string("chunk_embeddings_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{pooling_strategy}{Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that utilizes WordEmbeddings or BertEmbeddings to generate chunk embeddings from either Chunker, 43 | NGramGenerator, or NerConverter outputs. 44 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#chunkembeddings} 45 | } 46 | -------------------------------------------------------------------------------- /man/nlp_chunk_filterer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk_filterer.R 3 | \name{nlp_chunk_filterer} 4 | \alias{nlp_chunk_filterer} 5 | \title{Spark NLP ChunkFilterer} 6 | \usage{ 7 | nlp_chunk_filterer( 8 | x, 9 | input_cols, 10 | output_col, 11 | criteria = NULL, 12 | whitelist = NULL, 13 | regex = NULL, 14 | uid = random_string("chunk_filterer_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{criteria}{isin or regex} 25 | 26 | \item{whitelist}{If defined, list of entities to process.} 27 | 28 | \item{regex}{If defined, list of entities to process.} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that will filter out named entities by some conditions 49 | or predefined look-up lists, so that you can feed these entities to other 50 | annotators like Assertion Status or Entity Resolvers. It can be used with 51 | two criteria: isin and regex. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_release_notes#2-chunkfilterer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_chunker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunker.R 3 | \name{nlp_chunker} 4 | \alias{nlp_chunker} 5 | \title{Spark NLP Chunker - Meaningful phrase matching} 6 | \usage{ 7 | nlp_chunker( 8 | x, 9 | input_cols, 10 | output_col, 11 | regex_parsers = NULL, 12 | uid = random_string("chunker_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{regex_parsers}{the regular expression parsers to use for the chunking} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 
28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that matches a pattern of part-of-speech tags in order to return meaningful phrases from document 43 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#chunker} 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_classifier_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/classifier_dl.R 3 | \name{nlp_classifier_dl_pretrained} 4 | \alias{nlp_classifier_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Classifier DL model} 6 | \usage{ 7 | nlp_classifier_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | include_confidence = NULL, 12 | name = NULL, 13 | lang = NULL, 14 | remote_loc = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{include_confidence}{whether to include the confidence scores in the predictions} 25 | 26 | \item{name}{the name of the model to load. If NULL will use the default value} 27 | 28 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 29 | 30 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Create a pretrained Spark NLP \code{ClassifierDLModel} model 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_conllu_read_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_conllu_read_dataset} 4 | \alias{nlp_conllu_read_dataset} 5 | \title{Transform CoNLLU format text file to Spark dataframe} 6 | \usage{ 7 | nlp_conllu_read_dataset(sc, path, read_as = NULL, explode_sentences = NULL) 8 | } 9 | \arguments{ 10 | \item{sc}{a Spark connection} 11 | 12 | \item{path}{path to the file to read} 13 | 14 | \item{read_as}{Can be LINE_BY_LINE or SPARK_DATASET, with options if latter is used (default LINE_BY_LINE)} 15 | } 16 | \description{ 17 | In order to train a Lemmatizer annotator, we need to get CoNLLU format data as a spark dataframe. 18 | There is a component that does this for us: it reads a plain text file and transforms it to a spark dataset. 19 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#conllu-dataset}. All the function arguments have defaults. 20 | See \url{https://nlp.johnsnowlabs.com/api/index.html#com.johnsnowlabs.nlp.training.CoNLLU} for the defaults. 
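For example, a minimal usage sketch (the file path and the sparklyr connection \code{sc} are illustrative): \preformatted{# read a CoNLL-U file into a Spark dataframe suitable for training a Lemmatizer
conllu_df <- nlp_conllu_read_dataset(sc, "path/to/train.conllu")
}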
21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_context_spell_checker_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/context-spell-checker.R 3 | \name{nlp_context_spell_checker_pretrained} 4 | \alias{nlp_context_spell_checker_pretrained} 5 | \title{Load a pretrained Spark NLP ContextSpellChecker model} 6 | \usage{ 7 | nlp_context_spell_checker_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{ContextSpellChecker} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_dependency_parser_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dependency-parser.R 3 | \name{nlp_dependency_parser_pretrained} 4 | \alias{nlp_dependency_parser_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_dependency_parser_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{DependencyParserModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_distilbert_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distilbert-for-token-classification.R 3 | \name{nlp_distilbert_token_classification_pretrained} 4 | \alias{nlp_distilbert_token_classification_pretrained} 5 | \title{Spark NLP DistilBertForTokenClassification} 6 | \usage{ 7 | nlp_distilbert_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. 
String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{name}{the name of the model to load. If NULL will use the default value} 31 | 32 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 33 | 34 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 35 | } 36 | \value{ 37 | The Spark NLP model with the pretrained model loaded 38 | } 39 | \description{ 40 | DistilBertForTokenClassification can load DistilBERT Models with a token classification head on top 41 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 42 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#distilbertfortokenclassification} 43 | } 44 | -------------------------------------------------------------------------------- /man/nlp_drug_normalizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/drug_normalizer.R 3 | \name{nlp_drug_normalizer} 4 | \alias{nlp_drug_normalizer} 5 | \title{Spark NLP DrugNormalizer} 6 | \usage{ 7 | nlp_drug_normalizer( 8 | x, 9 | input_cols, 10 | output_col, 11 | lower_case = NULL, 12 | policy = NULL, 13 | uid = random_string("drug_normalizer_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{lower_case}{whether to convert strings to lowercase} 24 | 25 | \item{policy}{removalPolicy to remove patterns from text with a given policy} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that normalizes raw text from clinical documents, e.g. 46 | scraped web pages or xml documents, from document type columns into Sentence.
47 | Removes all dirty characters from text following one or more input regex 48 | patterns. Can apply unwanted character removal with a specific policy. 49 | Can apply lower case normalization. 50 | See \url{https://nlp.johnsnowlabs.com/licensed/api/index.html#com.johnsnowlabs.nlp.annotators.DrugNormalizer} 51 | } 52 | -------------------------------------------------------------------------------- /man/nlp_elmo_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/elmo-embeddings.R 3 | \name{nlp_elmo_embeddings_pretrained} 4 | \alias{nlp_elmo_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP ElmoEmbeddings model} 6 | \usage{ 7 | nlp_elmo_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | pooling_layer = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{the embedding dimension} 32 | 33 | \item{pooling_layer}{word_emb, lstm_outputs1, lstm_outputs2 or elmo} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{ElmoEmbeddings} model 46 | } 47 | -------------------------------------------------------------------------------- /man/nlp_embeddings_finisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/embeddings_finisher.R 3 | \name{nlp_embeddings_finisher} 4 | \alias{nlp_embeddings_finisher} 5 | \title{Spark NLP EmbeddingsFinisher} 6 | \usage{ 7 | nlp_embeddings_finisher( 8 | x, 9 | input_cols, 10 | output_cols, 11 | clean_annotations = NULL, 12 | output_as_vector = NULL, 13 | uid = random_string("embeddings_finisher_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_cols}{Output columns. String array.} 22 | 23 | \item{clean_annotations}{Whether to remove and cleanup the rest of the annotators (columns)} 24 | 25 | \item{output_as_vector}{if enabled, it will output the embeddings as Vectors instead of arrays} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects.
36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that is designed to deal with embedding annotators: WordEmbeddings, BertEmbeddings, 46 | SentenceEmbeddings, and ChunkEmbeddings. By using EmbeddingsFinisher you can easily transform your embeddings 47 | into arrays of floats or Vectors which are compatible with Spark ML functions such as LDA, K-means, Random Forest 48 | classifier or any other functions that require a featureCol. 49 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#embeddingsfinisher} 50 | } 51 | -------------------------------------------------------------------------------- /man/nlp_finisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/finisher.R 3 | \name{nlp_finisher} 4 | \alias{nlp_finisher} 5 | \title{Spark NLP Finisher} 6 | \usage{ 7 | nlp_finisher( 8 | x, 9 | input_cols, 10 | output_cols = NULL, 11 | clean_annotations = NULL, 12 | value_split_symbol = NULL, 13 | annotation_split_symbol = NULL, 14 | include_metadata = NULL, 15 | output_as_array = NULL, 16 | uid = random_string("finisher_") 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 21 | 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_cols}{Output columns. String array.} 25 | 26 | \item{clean_annotations}{Boolean. Whether to remove intermediate annotations} 27 | 28 | \item{value_split_symbol}{String. Optional. Character used to split values within an annotation} 29 | 30 | \item{annotation_split_symbol}{String. Optional. Character used to split values between annotations} 31 | 32 | \item{include_metadata}{Boolean. Optional. Whether to include metadata keys. Sometimes useful in some annotations} 33 | 34 | \item{output_as_array}{Boolean. Optional. Whether to output as Array. Useful as input for other Spark transformers} 35 | 36 | \item{uid}{A character string used to uniquely identify the ML estimator.} 37 | } 38 | \value{ 39 | The object returned depends on the class of \code{x}. 40 | 41 | \itemize{ 42 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 43 | a Spark \code{Estimator} object and can be used to compose 44 | \code{Pipeline} objects. 45 | 46 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 47 | the NLP estimator appended to the pipeline. 48 | 49 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 50 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 51 | } 52 | } 53 | \description{ 54 | Spark ML transformer that outputs annotation values as strings.
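For instance, a minimal usage sketch (the sparklyr connection \code{sc} and the column names are illustrative): \preformatted{# pull plain string values out of an annotation column at the end of a pipeline
finisher <- nlp_finisher(sc, input_cols = c("ner_chunk"), output_cols = c("ner_chunk_str"), output_as_array = TRUE)
}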
55 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#finisher} 56 | } 57 | -------------------------------------------------------------------------------- /man/nlp_generate_assertion_train_set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_generate_assertion_train_set} 4 | \alias{nlp_generate_assertion_train_set} 5 | \title{Generate an assertion training set from an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_generate_assertion_train_set( 8 | reader, 9 | df, 10 | sentence_col = "sentence", 11 | assertion_col = "assertion_label" 12 | ) 13 | } 14 | \arguments{ 15 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 16 | 17 | \item{df}{a Spark Dataframe} 18 | 19 | \item{sentence_col}{the name of the sentence column} 20 | 21 | \item{assertion_col}{the name of the assertion column} 22 | } 23 | \value{ 24 | assertion training set data frame 25 | } 26 | \description{ 27 | Generate an assertion training set from an AnnotationToolJsonReader 28 | } 29 | -------------------------------------------------------------------------------- /man/nlp_generate_colln.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_generate_colln} 4 | \alias{nlp_generate_colln} 5 | \title{Generate a CoNLL format file from a data frame using an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_generate_colln( 8 | reader, 9 | df, 10 | path, 11 | task_col = "task_id", 12 | token_col = "token", 13 | ner_label = "ner_label" 14 | ) 15 | } 16 | \arguments{ 17 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 18 | 19 | \item{df}{a Spark Dataframe} 20 | 21 | \item{task_col}{the name of the task column} 22 | 23 | \item{token_col}{the name of the token column} 24 | 25 | \item{ner_label}{the name of the ner label column} 26 | } 27 | \description{ 28 | Generate a CoNLL format file from a data frame using an AnnotationToolJsonReader 29 | } 30 | -------------------------------------------------------------------------------- /man/nlp_generate_plain_assertion_train_set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_generate_plain_assertion_train_set} 4 | \alias{nlp_generate_plain_assertion_train_set} 5 | \title{Generate a plain assertion training set from an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_generate_plain_assertion_train_set( 8 | reader, 9 | df, 10 | task_col = "task_id", 11 | token_col = "token", 12 | ner_label = "ner_label", 13 | assertion_label = "assertion_label" 14 | ) 15 | } 16 | \arguments{ 17 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 18 | 19 | \item{df}{a Spark Dataframe} 20 | 21 | \item{task_col}{the name of the task column} 22 | 23 | \item{token_col}{the name of the token column} 24 | 25 | \item{ner_label}{the name of the ner label column} 26 | 27 | \item{assertion_col}{the name of the assertion column} 28 | } 29 | \value{ 30 | assertion training set data frame 31 | } 32 | \description{ 33 | Generate a plain assertion training set from an 
AnnotationToolJsonReader 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_get_classes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_get_classes} 4 | \alias{nlp_get_classes} 5 | \title{Get classes used to train a model} 6 | \usage{ 7 | nlp_get_classes(model) 8 | } 9 | \arguments{ 10 | \item{model}{a trained SparkNLP model that implements getClasses()} 11 | } 12 | \value{ 13 | a list of classes 14 | } 15 | \description{ 16 | Get classes used to train a model 17 | } 18 | -------------------------------------------------------------------------------- /man/nlp_graph_finisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/graph-finisher.R 3 | \name{nlp_graph_finisher} 4 | \alias{nlp_graph_finisher} 5 | \title{Spark NLP GraphFinisher} 6 | \usage{ 7 | nlp_graph_finisher( 8 | x, 9 | input_col, 10 | output_col, 11 | clean_annotations = NULL, 12 | include_metadata = NULL, 13 | output_as_array = NULL, 14 | uid = random_string("graph_finisher_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_col}{Input column. String.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{clean_annotations}{Whether to remove annotation columns (Default: true)} 25 | 26 | \item{include_metadata}{Annotation metadata format (Default: false)} 27 | 28 | \item{output_as_array}{Finisher generates an Array with the results instead of string (Default: true)} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Helper class to convert the knowledge graph from GraphExtraction into a generic format, such as RDF. 
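A minimal usage sketch (the connection \code{sc} and the column names are illustrative): \preformatted{# flatten the GraphExtraction output column into a plain array column
graph_finisher <- nlp_graph_finisher(sc, input_col = "graph", output_col = "graph_finished", output_as_array = TRUE)
}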
49 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#graphfinisher} 50 | } 51 | -------------------------------------------------------------------------------- /man/nlp_language_detector_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/language-detector-dl.R 3 | \name{nlp_language_detector_dl_pretrained} 4 | \alias{nlp_language_detector_dl_pretrained} 5 | \title{Load a pretrained Spark NLP LanguageDetectorDL model} 6 | \usage{ 7 | nlp_language_detector_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | alphabet = NULL, 12 | coalesce_sentences = NULL, 13 | language = NULL, 14 | threshold = NULL, 15 | threshold_label = NULL, 16 | name = NULL, 17 | lang = NULL, 18 | remote_loc = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{sc}{A Spark connection} 23 | 24 | \item{input_cols}{Input columns. String array.} 25 | 26 | \item{output_col}{Output column. String.} 27 | 28 | \item{alphabet}{alphabet used to feed the TensorFlow model for prediction (Map of string to integer) This should be an R environment} 29 | 30 | \item{coalesce_sentences}{If set to true, the output of all sentences will be averaged to one output instead of one output per sentence. (boolean)} 31 | 32 | \item{language}{used to map prediction to two-letter (ISO 639-1) language codes (Map of string to integer) This should be an R environment} 33 | 34 | \item{threshold}{The minimum threshold for the final result, otherwise it will be either Unknown or the value set in thresholdLabel.} 35 | 36 | \item{threshold_label}{The label to assign in case the prediction score is less than the threshold.} 37 | 38 | \item{name}{the name of the model to load. If NULL will use the default value} 39 | 40 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 41 | 42 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 43 | } 44 | \value{ 45 | The Spark NLP model with the pretrained model loaded 46 | } 47 | \description{ 48 | Create a pretrained Spark NLP \code{LanguageDetectorDL} model 49 | } 50 | -------------------------------------------------------------------------------- /man/nlp_lemmatizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemmatizer.R 3 | \name{nlp_lemmatizer} 4 | \alias{nlp_lemmatizer} 5 | \title{Spark NLP Lemmatizer} 6 | \usage{ 7 | nlp_lemmatizer( 8 | x, 9 | input_cols, 10 | output_col, 11 | dictionary_path = NULL, 12 | dictionary_key_delimiter = "->", 13 | dictionary_value_delimiter = "\\t", 14 | dictionary_read_as = "TEXT", 15 | dictionary_options = list(format = "text"), 16 | uid = random_string("lemmatizer_") 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 21 | 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column.
String.} 25 | 26 | \item{dictionary_path}{Path to lemma dictionary, in lemma vs possible words format.} 27 | 28 | \item{dictionary_key_delimiter}{key delimiter in the dictionary file} 29 | 30 | \item{dictionary_value_delimiter}{value delimiter in the dictionary file} 31 | 32 | \item{dictionary_read_as}{readAs TEXT or SPARK_DATASET} 33 | 34 | \item{dictionary_options}{options passed to the spark reader if read_as is SPARK_DATASET} 35 | 36 | \item{uid}{A character string used to uniquely identify the ML estimator.} 37 | } 38 | \value{ 39 | The object returned depends on the class of \code{x}. 40 | 41 | \itemize{ 42 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 43 | a Spark \code{Estimator} object and can be used to compose 44 | \code{Pipeline} objects. 45 | 46 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 47 | a default pretrained NLP model appended to the pipeline. 48 | 49 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 50 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 51 | } 52 | } 53 | \description{ 54 | Spark ML estimator that retrieves lemmas out of words with the objective of returning a base dictionary word 55 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#lemmatizer} 56 | } 57 | -------------------------------------------------------------------------------- /man/nlp_lemmatizer_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemmatizer.R 3 | \name{nlp_lemmatizer_pretrained} 4 | \alias{nlp_lemmatizer_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_lemmatizer_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{LemmatizerModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_light_pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/light-pipeline.R 3 | \name{nlp_light_pipeline} 4 | \alias{nlp_light_pipeline} 5 | \title{Spark NLP Light pipeline} 6 | \usage{ 7 | nlp_light_pipeline(x, parse_embeddings = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{a trained (fitted) pipeline} 11 | 12 | \item{parse_embeddings}{whether to parse the embeddings} 13 | } 14 | \value{ 15 | a LightPipeline object 16 | } 17 | \description{ 18 | LightPipelines are Spark ML pipelines converted into a single-machine, multithreaded task, becoming more than 19 | 10x faster for smaller amounts of data (small is relative, but 50k sentences is roughly a good maximum). 20 | To use them, simply plug in a trained (fitted) pipeline. 21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_longformer_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/longformer-for-token-classification.R 3 | \name{nlp_longformer_token_classification_pretrained} 4 | \alias{nlp_longformer_token_classification_pretrained} 5 | \title{Spark NLP LongformerForTokenClassification} 6 | \usage{ 7 | nlp_longformer_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{name}{the name of the model to load. If NULL will use the default value} 31 | 32 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 33 | 34 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 35 | } 36 | \value{ 37 | The Spark NLP model with the pretrained model loaded 38 | } 39 | \description{ 40 | LongformerForTokenClassification can load Longformer Models with a token classification head on top 41 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
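A minimal usage sketch (the connection \code{sc} and the column names are illustrative; \code{name}, \code{lang} and \code{remote_loc} fall back to their defaults when omitted): \preformatted{# load the default pretrained model and tag tokens with NER labels
token_classifier <- nlp_longformer_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "ner")
}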
42 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#longformerfortokenclassification} 43 | } 44 | -------------------------------------------------------------------------------- /man/nlp_marian_transformer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/marian_transformer.R 3 | \name{nlp_marian_transformer} 4 | \alias{nlp_marian_transformer} 5 | \title{Spark NLP MarianTransformer} 6 | \usage{ 7 | nlp_marian_transformer( 8 | x, 9 | input_cols, 10 | output_col, 11 | lang_id = NULL, 12 | max_input_length = NULL, 13 | max_output_length = NULL, 14 | vocabulary = NULL, 15 | uid = random_string("marian_transformer_") 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{lang_id}{A string representing the target language in the form of >>id<< (id = valid target language ID)} 26 | 27 | \item{max_input_length}{Controls the maximum length for encoder inputs (source language texts). Default: 40} 28 | 29 | \item{max_output_length}{Controls the maximum length for decoder outputs (target language texts). Default: 40} 30 | 31 | \item{vocabulary}{Vocabulary used to encode and decode piece tokens generated by SentencePiece. This will be set once the model is created and cannot be changed afterwards} 32 | 33 | \item{uid}{A character string used to uniquely identify the ML estimator.} 34 | } 35 | \value{ 36 | The object returned depends on the class of \code{x}. 37 | 38 | \itemize{ 39 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 40 | a Spark \code{Estimator} object and can be used to compose 41 | \code{Pipeline} objects. 42 | 43 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 44 | the NLP estimator appended to the pipeline. 45 | 46 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 47 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 48 | } 49 | } 50 | \description{ 51 | Spark ML transformer that performs neural machine translation using MarianMT models. 52 | See \url{https://nlp.johnsnowlabs.com/api/#com.johnsnowlabs.nlp.annotators.seq2seq.MarianTransformer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_marian_transformer_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/marian_transformer.R 3 | \name{nlp_marian_transformer_pretrained} 4 | \alias{nlp_marian_transformer_pretrained} 5 | \title{Load a pretrained Spark NLP Marian Transformer model} 6 | \usage{ 7 | nlp_marian_transformer_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded.
If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{MarianTransformerModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_medical_ner_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/medical-ner.R 3 | \name{nlp_medical_ner_pretrained} 4 | \alias{nlp_medical_ner_pretrained} 5 | \title{Load a pretrained Spark NLP Medical NER model} 6 | \usage{ 7 | nlp_medical_ner_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | include_confidence = NULL, 12 | label_casing = NULL, 13 | name = NULL, 14 | lang = NULL, 15 | remote_loc = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{sc}{A Spark connection} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{include_confidence}{whether to include confidence values} 26 | 27 | \item{label_casing}{Set the tag to case sensitive or not.Setting all labels of the NER models upper/lower case.} 28 | 29 | \item{name}{the name of the model to load. If NULL will use the default value} 30 | 31 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 32 | 33 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 34 | } 35 | \value{ 36 | The Spark NLP model with the pretrained model loaded 37 | } 38 | \description{ 39 | Create a pretrained Spark NLP \code{MedicalNerModel} model 40 | } 41 | -------------------------------------------------------------------------------- /man/nlp_multi_classifier_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/multi_classifier_dl.R 3 | \name{nlp_multi_classifier_dl_pretrained} 4 | \alias{nlp_multi_classifier_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Multilabel Classifier DL model} 6 | \usage{ 7 | nlp_multi_classifier_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | threshold = NULL, 12 | name = NULL, 13 | lang = NULL, 14 | remote_loc = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{threshold}{the minimum threshold for each label to be accepted} 25 | 26 | \item{name}{the name of the model to load. If NULL will use the default value} 27 | 28 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 29 | 30 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Create a pretrained Spark NLP \code{MultiClassifierDLModel} model 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_ner_chunker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ner_chunker.R 3 | \name{nlp_ner_chunker} 4 | \alias{nlp_ner_chunker} 5 | \title{Spark NLP NerChunker} 6 | \usage{ 7 | nlp_ner_chunker( 8 | x, 9 | input_cols, 10 | output_col, 11 | regex_parsers = NULL, 12 | uid = random_string("ner_chunker_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{regex_parsers}{A list of regex patterns to match chunks, for example: c("<DT>?<JJ>*<NN>")} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that extracts phrases that fit into a known pattern using the NER tags 43 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_release_notes#1-nerchunker} 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_ner_crf_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ner-crf.R 3 | \name{nlp_ner_crf_pretrained} 4 | \alias{nlp_ner_crf_pretrained} 5 | \title{Load a pretrained Spark NLP NER CRF model} 6 | \usage{ 7 | nlp_ner_crf_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{NerCrfModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_ner_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ner-dl.R 3 | \name{nlp_ner_dl_pretrained} 4 | \alias{nlp_ner_dl_pretrained} 5 | \title{Load a pretrained Spark NLP NER DL model} 6 | \usage{ 7 | nlp_ner_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | include_confidence = NULL, 12 | include_all_confidence_scores = NULL, 13 | name = NULL, 14 | lang = NULL, 15 | remote_loc = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{sc}{A Spark connection} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{include_confidence}{whether to include confidence values} 26 | 27 | \item{name}{the name of the model to load. If NULL will use the default value} 28 | 29 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 30 | 31 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 32 | } 33 | \value{ 34 | The Spark NLP model with the pretrained model loaded 35 | } 36 | \description{ 37 | Create a pretrained Spark NLP \code{NerDLModel} model 38 | } 39 | -------------------------------------------------------------------------------- /man/nlp_ngram_generator.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngram-generator.R 3 | \name{nlp_ngram_generator} 4 | \alias{nlp_ngram_generator} 5 | \title{Spark NLP NGramGenerator} 6 | \usage{ 7 | nlp_ngram_generator( 8 | x, 9 | input_cols, 10 | output_col, 11 | n = NULL, 12 | enable_cumulative = NULL, 13 | delimiter = NULL, 14 | uid = random_string("ngram_generator_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{n}{number elements per n-gram (>=1)} 25 | 26 | \item{enable_cumulative}{whether to calculate just the actual n-grams or all n-grams from 1 through n} 27 | 28 | \item{delimiter}{glue character used to join the tokens} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that takes as input a sequence of strings (e.g. 
the output of a Tokenizer, Normalizer, Stemmer, 49 | Lemmatizer, and StopWordsCleaner). The parameter n is used to determine the number of terms in each n-gram. 50 | The output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string of n 51 | consecutive words with annotatorType CHUNK same as the Chunker annotator. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#ngramgenerator} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_norvig_spell_checker_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/norvig-spell-checker.R 3 | \name{nlp_norvig_spell_checker_pretrained} 4 | \alias{nlp_norvig_spell_checker_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_norvig_spell_checker_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{NorvigSweetingModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_perceptron.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perceptron.R 3 | \name{nlp_perceptron} 4 | \alias{nlp_perceptron} 5 | \title{Spark NLP Perceptron} 6 | \usage{ 7 | nlp_perceptron( 8 | x, 9 | input_cols, 10 | output_col, 11 | n_iterations = NULL, 12 | pos_column = NULL, 13 | uid = random_string("perceptron_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{n_iterations}{Number of iterations for training. May improve accuracy but takes longer. Default 5.} 24 | 25 | \item{pos_column}{Column containing an array of POS Tags matching every token on the line.} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | a default pretrained NLP model appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that sets a POS tag to each word within a sentence. Its train data (train_pos) is a spark 46 | dataset of POS format values with Annotation columns. 47 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#postagger} 48 | } 49 | -------------------------------------------------------------------------------- /man/nlp_perceptron_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perceptron.R 3 | \name{nlp_perceptron_pretrained} 4 | \alias{nlp_perceptron_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_perceptron_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{Perceptron} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_pos.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perceptron.R 3 | \name{nlp_pos} 4 | \alias{nlp_pos} 5 | \title{Read a part of speech tagging training file into a dataset} 6 | \usage{ 7 | nlp_pos( 8 | sc, 9 | file_path, 10 | delimiter = NULL, 11 | output_pos_col = NULL, 12 | output_document_col = NULL, 13 | output_text_col = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{Spark connection} 18 | 19 | \item{file_path}{path to the text file with the training data} 20 | 21 | \item{delimiter}{the delimiter used in the training data} 22 | 23 | \item{output_pos_col}{the pos column name for the output data frame} 24 | 25 | \item{output_document_col}{the document column name for the output data frame} 26 | 27 | \item{output_text_col}{the text column name for the output data frame} 28 | } 29 | \value{ 30 | Spark dataframe containing the data 31 | } 32 | \description{ 33 | In order to train a Part of Speech Tagger annotator, we need to get corpus data as a spark dataframe. 34 | This function does this: it reads a plain text file and transforms it to a spark dataset that is ready 35 | for training a POS tagger. 
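A minimal usage sketch (the file path, delimiter, and sparklyr connection \code{sc} are illustrative): \preformatted{# read a delimited word|tag training file into a dataframe ready for training a POS tagger
pos_train_df <- nlp_pos(sc, "path/to/pos_corpus.txt", delimiter = "|")
}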
36 | See the Scala API docs for the default parameter values 37 | (\url{https://nlp.johnsnowlabs.com/api/index.html#com.johnsnowlabs.nlp.training.POS}) 38 | } 39 | -------------------------------------------------------------------------------- /man/nlp_pretrained_pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pretrained-pipeline.R 3 | \name{nlp_pretrained_pipeline} 4 | \alias{nlp_pretrained_pipeline} 5 | \title{Spark NLP Pretrained pipeline} 6 | \usage{ 7 | nlp_pretrained_pipeline( 8 | x, 9 | download_name, 10 | lang = "en", 11 | source = "public/models", 12 | parse_embeddings_vectors = FALSE, 13 | disk_location = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{a Spark connection, Spark dataframe, or a string or character vector} 18 | 19 | \item{download_name}{the name of the pretrained pipeline to download and create} 20 | 21 | \item{lang}{the language of the pipeline} 22 | 23 | \item{source}{the source for the pipeline file} 24 | 25 | \item{parse_embeddings_vectors}{whether to parse the embeddings vectors or not} 26 | 27 | \item{disk_location}{optional location on disk that the pipeline should be loaded from} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of 34 | a \code{ml_pipeline} created from the pretrained pipeline. 35 | 36 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, the pretrained pipeline is created and immediately 37 | run on the provided dataframe using \code{ml_fit_and_transform}, returning the transformed data frame. 38 | } 39 | } 40 | \description{ 41 | Creates a Spark NLP pretrained pipeline. See 42 | \url{https://nlp.johnsnowlabs.com/api/index.html#com.johnsnowlabs.nlp.pretrained.PretrainedPipeline} for the 43 | default values for the parameters if left null 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_pubtator_read_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_pubtator_read_dataset} 4 | \alias{nlp_pubtator_read_dataset} 5 | \title{PubTator Dataset} 6 | \usage{ 7 | nlp_pubtator_read_dataset(sc, path) 8 | } 9 | \arguments{ 10 | \item{sc}{Spark connection} 11 | 12 | \item{path}{path to a PubTator file} 13 | } 14 | \value{ 15 | Spark Dataframe created from the PubTator file 16 | } 17 | \description{ 18 | The PubTator format includes medical papers’ titles, abstracts, and tagged chunks 19 | (see \href{http://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=format_3783}{PubTator Docs} and 20 | \href{http://github.com/chanzuckerberg/MedMentions}{MedMentions Docs} 21 | for more information). We can create a Spark DataFrame from a PubTator text file.
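For example, a minimal usage sketch (the file path and the sparklyr connection \code{sc} are illustrative): \preformatted{# read a PubTator corpus file into a Spark dataframe
pubtator_df <- nlp_pubtator_read_dataset(sc, "path/to/corpus_pubtator.txt")
}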
22 | } 23 | -------------------------------------------------------------------------------- /man/nlp_re_ner_chunks_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/re_ner_chunks_filter.R 3 | \name{nlp_re_ner_chunks_filter} 4 | \alias{nlp_re_ner_chunks_filter} 5 | \title{Spark NLP RENerChunksFilter} 6 | \usage{ 7 | nlp_re_ner_chunks_filter( 8 | x, 9 | input_cols, 10 | output_col, 11 | max_syntactic_distance = NULL, 12 | relation_pairs, 13 | uid = random_string("re_ner_chunks_filter_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{max_syntactic_distance}{Maximal syntactic distance, as threshold (Default: 0)} 24 | 25 | \item{relation_pairs}{List of dash-separated pairs of named entities 26 | ("ENTITY1-ENTITY2", e.g. "Biomarker-RelativeDay"), which will be processed} 27 | 28 | \item{uid}{A character string used to uniquely identify the ML estimator.} 29 | } 30 | \value{ 31 | The object returned depends on the class of \code{x}. 32 | 33 | \itemize{ 34 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 35 | a Spark \code{Estimator} object and can be used to compose 36 | \code{Pipeline} objects. 37 | 38 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 39 | the NLP estimator appended to the pipeline. 40 | 41 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 42 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 43 | } 44 | } 45 | \description{ 46 | Spark ML transformer that filters and outputs combinations of relations between 47 | extracted entities, for further processing. This annotator is especially useful 48 | to create inputs for the RelationExtractionDLModel. 49 | } 50 | \details{ 51 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_annotators#renerchunksfilter} 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_recursive_pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/recursive-pipeline.R 3 | \name{nlp_recursive_pipeline} 4 | \alias{nlp_recursive_pipeline} 5 | \title{Spark NLP RecursivePipeline} 6 | \usage{ 7 | nlp_recursive_pipeline(x, ..., uid = random_string("recursive_pipeline_")) 8 | } 9 | \arguments{ 10 | \item{x}{Either a \code{spark_connection} or \code{ml_pipeline_stage} objects} 11 | 12 | \item{...}{\code{ml_pipeline_stage} objects} 13 | 14 | \item{uid}{uid for the pipeline} 15 | } 16 | \value{ 17 | When \code{x} is a \code{spark_connection}, \code{ml_pipeline()} returns an empty pipeline object. 18 | When \code{x} is a \code{ml_pipeline_stage}, \code{ml_pipeline()} returns an \code{ml_pipeline} with the stages 19 | set to \code{x} and any transformers or estimators given in \code{...}. 
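For illustration, a hedged sketch of both calling conventions (the stage objects are hypothetical and assumed to have been created earlier with their nlp_* constructors; \code{sc} is an open sparklyr connection):

# Start from a Spark connection to get an empty recursive pipeline
pipeline <- nlp_recursive_pipeline(sc)

# Or build it directly from existing pipeline stages
pipeline <- nlp_recursive_pipeline(document_assembler, sentence_detector, tokenizer)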
20 | } 21 | \description{ 22 | Recursive pipelines are Spark NLP-specific pipelines that allow a Spark ML Pipeline to know about itself on every 23 | Pipeline Stage task, allowing annotators to utilize this same pipeline against external resources to process them 24 | in the same way the user decides. Only some of our annotators take advantage of this. RecursivePipeline behaves 25 | exactly the same as normal Spark ML pipelines, so it can be used with the same intent. 26 | See \url{https://nlp.johnsnowlabs.com/docs/en/concepts#recursivepipeline} 27 | } 28 | -------------------------------------------------------------------------------- /man/nlp_recursive_tokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/recursive-tokenizer.R 3 | \name{nlp_recursive_tokenizer} 4 | \alias{nlp_recursive_tokenizer} 5 | \title{Spark NLP RecursiveTokenizer} 6 | \usage{ 7 | nlp_recursive_tokenizer( 8 | x, 9 | input_cols, 10 | output_col, 11 | infixes = NULL, 12 | prefixes = NULL, 13 | suffixes = NULL, 14 | white_list = NULL, 15 | uid = random_string("recursive_tokenizer_") 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{infixes}{strings that will be split when found at the middle of a token} 26 | 27 | \item{prefixes}{strings that will be split when found at the beginning of a token} 28 | 29 | \item{suffixes}{strings that will be split when found at the end of a token} 30 | 31 | \item{white_list}{whitelist} 32 | 33 | \item{uid}{A character string used to uniquely identify the ML estimator.} 34 | } 35 | \value{ 36 | The object returned depends on the class of \code{x}. 37 | 38 | \itemize{ 39 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 40 | a Spark \code{Estimator} object and can be used to compose 41 | \code{Pipeline} objects. 42 | 43 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 44 | the NLP estimator appended to the pipeline. 45 | 46 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 47 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 48 | } 49 | } 50 | \description{ 51 | Spark ML estimator that tokenizes raw text into tokens, splitting recursively on the supplied prefix, infix, and suffix patterns. 52 | See \url{https://nlp.johnsnowlabs.com/api/index#com.johnsnowlabs.nlp.annotators.RecursiveTokenizer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_regex_matcher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/regex_matcher.R 3 | \name{nlp_regex_matcher} 4 | \alias{nlp_regex_matcher} 5 | \title{Spark NLP RegexMatcher} 6 | \usage{ 7 | nlp_regex_matcher( 8 | x, 9 | input_cols, 10 | output_col, 11 | strategy = NULL, 12 | rules_path, 13 | rules_path_delimiter, 14 | rules_path_read_as = "TEXT", 15 | rules_path_options = list(format = "text"), 16 | uid = random_string("regex_matcher_") 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 21 | 22 | \item{input_cols}{Input columns.
String array.} 23 | 24 | \item{output_col}{Output column. String.} 25 | 26 | \item{strategy}{Can be any of MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE} 27 | 28 | \item{rules_path}{Path to file containing a set of regex,key pairs} 29 | 30 | \item{rules_path_delimiter}{delimiter between regex and key in the file} 31 | 32 | \item{rules_path_read_as}{TEXT or SPARK_DATASET} 33 | 34 | \item{rules_path_options}{options passed to Spark reader if read_as is SPARK_DATASET} 35 | 36 | \item{uid}{A character string used to uniquely identify the ML estimator.} 37 | } 38 | \value{ 39 | The object returned depends on the class of \code{x}. 40 | 41 | \itemize{ 42 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 43 | a Spark \code{Estimator} object and can be used to compose 44 | \code{Pipeline} objects. 45 | 46 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 47 | the NLP estimator appended to the pipeline. 48 | 49 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 50 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 51 | } 52 | } 53 | \description{ 54 | Spark ML estimator that matches regular expressions from an external rules file against the input text and tags each match with its associated key. 55 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#regexmatcher} 56 | } 57 | -------------------------------------------------------------------------------- /man/nlp_relation_extraction_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/relation_extraction_dl.R 3 | \name{nlp_relation_extraction_dl_pretrained} 4 | \alias{nlp_relation_extraction_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Relation Extraction DL model} 6 | \usage{ 7 | nlp_relation_extraction_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | prediction_threshold = NULL, 12 | name = NULL, 13 | lang = NULL, 14 | remote_loc = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{prediction_threshold}{Minimal activation of the target unit to encode a new relation instance (Default: 0.5f)} 25 | 26 | \item{name}{the name of the model to load. If NULL will use the default value} 27 | 28 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 29 | 30 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Create a pretrained Spark NLP \code{RelationExtractionDLModel} model 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_relation_extraction_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/relation_extraction.R 3 | \name{nlp_relation_extraction_pretrained} 4 | \alias{nlp_relation_extraction_pretrained} 5 | \title{Load a pretrained Spark NLP Relation Extraction model} 6 | \usage{ 7 | nlp_relation_extraction_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | relation_pairs, 12 | max_syntactic_distance = NULL, 13 | feature_scaling = NULL, 14 | prediction_threshold = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{relation_pairs}{List of dash-separated pairs of named entities ("ENTITY1-ENTITY2", 28 | e.g. "Biomarker-RelativeDay"), which will be processed} 29 | 30 | \item{max_syntactic_distance}{Maximal syntactic distance, as threshold (Default: 0)} 31 | 32 | \item{feature_scaling}{Feature scaling method.} 33 | 34 | \item{prediction_threshold}{Minimal activation of the target unit to encode a new relation instance (Default: 0.5f)} 35 | 36 | \item{name}{the name of the model to load. If NULL will use the default value} 37 | 38 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 39 | 40 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 41 | } 42 | \value{ 43 | The Spark NLP model with the pretrained model loaded 44 | } 45 | \description{ 46 | Create a pretrained Spark NLP \code{RelationExtractionModel} model 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_resource_downloader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/resource_downloader.R 3 | \name{nlp_resource_downloader} 4 | \alias{nlp_resource_downloader} 5 | \alias{ResourceDownloader} 6 | \alias{nlp_show_public_pipelines} 7 | \alias{nlp_show_public_models} 8 | \alias{nlp_clear_cache} 9 | \alias{nlp_show_available_annotators} 10 | \title{SparkNLP ResourceDownloader functions} 11 | \usage{ 12 | nlp_show_public_pipelines(sc, lang = NULL, version = NULL) 13 | 14 | nlp_show_public_models(sc, annotator = NULL, lang = NULL, version = NULL) 15 | 16 | nlp_clear_cache(sc, name = NULL, language = NULL, remote_loc = NULL) 17 | 18 | nlp_show_available_annotators(sc) 19 | } 20 | \arguments{ 21 | \item{sc}{a spark_connect object} 22 | 23 | \item{lang}{language to restrict the results to} 24 | 25 | \item{version}{Spark NLP version to restrict results to} 26 | 27 | \item{annotator}{name of annotator to restrict results} 28 | 29 | \item{name}{name of object to clear} 30 | 31 | \item{language}{language to clear} 32 | 33 | \item{remote_loc}{remote_loc of models to clear} 34 | } 35 | \value{ 36 | a markdown table containing the models or pipelines filtered by the provided arguments 37 | } 38 | \description{ 39 | ResourceDownloader provides functions to easily look for pretrained models & pipelines 40 | inside Spark NLP. You can filter models or pipelines via language, version, 41 | or the name of the annotator 42 | } 43 | -------------------------------------------------------------------------------- /man/nlp_roberta_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/roberta-embeddings.R 3 | \name{nlp_roberta_embeddings_pretrained} 4 | \alias{nlp_roberta_embeddings_pretrained} 5 | \title{Spark NLP RoBertaEmbeddings} 6 | \usage{ 7 | nlp_roberta_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | storage_ref = NULL, 16 | name = NULL, 17 | lang = NULL, 18 | remote_loc = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column. String.} 25 | 26 | \item{batch_size}{Size of every batch (Default depends on model).} 27 | 28 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 29 | 30 | \item{dimension}{Number of embedding dimensions (Default depends on model)} 31 | 32 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 33 | 34 | \item{storage_ref}{Unique identifier for storage (Default: this.uid)} 35 | 36 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 37 | 38 | \item{uid}{A character string used to uniquely identify the ML estimator.} 39 | } 40 | \value{ 41 | The object returned depends on the class of \code{x}. 
42 | 43 | \itemize{ 44 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 45 | a Spark \code{Estimator} object and can be used to compose 46 | \code{Pipeline} objects. 47 | 48 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 49 | the NLP estimator appended to the pipeline. 50 | 51 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 52 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 53 | } 54 | } 55 | \description{ 56 | Spark ML transformer that 57 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#robertaembeddings} 58 | } 59 | -------------------------------------------------------------------------------- /man/nlp_roberta_sentence_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/roberta_sentence_embeddings.R 3 | \name{nlp_roberta_sentence_embeddings_pretrained} 4 | \alias{nlp_roberta_sentence_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP RoBertaSentenceEmbeddings model} 6 | \usage{ 7 | nlp_roberta_sentence_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to lowercase tokens or not} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{defines the output layer of BERT when calculating embeddings} 32 | 33 | \item{max_sentence_length}{max sentence length to process} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{RoBertaSentenceEmbeddings} model. 46 | Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in 47 | RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, 48 | Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, 49 | Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model 50 | released in 2018. 51 | } 52 | \details{ 53 | It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates. 
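A short, hedged loading example (the output column name is arbitrary, the model name is left to the default pretrained model, and \code{sc} is assumed to be an active sparklyr connection):

# Load the default pretrained RoBERTa sentence embeddings model
roberta_sent_embeddings <- nlp_roberta_sentence_embeddings_pretrained(
  sc,
  input_cols = c("document"),
  output_col = "sentence_embeddings"
)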
54 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#robertabertsentenceembeddings} 55 | } 56 | -------------------------------------------------------------------------------- /man/nlp_roberta_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/roberta-for-token-classification.R 3 | \name{nlp_roberta_token_classification_pretrained} 4 | \alias{nlp_roberta_token_classification_pretrained} 5 | \title{Spark NLP RoBertaForTokenClassification} 6 | \usage{ 7 | nlp_roberta_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | RoBertaForTokenClassification can load RoBERTa Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#xlnetfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_sentence_detector_dl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence_detector_dl.R 3 | \name{nlp_sentence_detector_dl} 4 | \alias{nlp_sentence_detector_dl} 5 | \title{Spark NLP SentenceDetectorDLApproach} 6 | \usage{ 7 | nlp_sentence_detector_dl( 8 | x, 9 | input_cols, 10 | output_col, 11 | epochs_number = NULL, 12 | impossible_penultimates = NULL, 13 | model = NULL, 14 | output_logs_path = NULL, 15 | validation_split = NULL, 16 | explode_sentences = NULL, 17 | uid = random_string("sentence_detector_dl_") 18 | ) 19 | } 20 | \arguments{ 21 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. 
String.} 26 | 27 | \item{epochs_number}{maximum number of epochs to train} 28 | 29 | \item{impossible_penultimates}{impossible penultimates} 30 | 31 | \item{model}{model architecture} 32 | 33 | \item{output_logs_path}{path to folder to output logs} 34 | 35 | \item{validation_split}{choose the proportion of training dataset to be validated against the model on each epoch} 36 | 37 | \item{explode_sentences}{a flag indicating whether to split sentences into different Dataset rows.} 38 | 39 | \item{uid}{A character string used to uniquely identify the ML estimator.} 40 | } 41 | \value{ 42 | The object returned depends on the class of \code{x}. 43 | 44 | \itemize{ 45 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 46 | a Spark \code{Estimator} object and can be used to compose 47 | \code{Pipeline} objects. 48 | 49 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 50 | the NLP estimator appended to the pipeline. 51 | 52 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 53 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 54 | } 55 | } 56 | \description{ 57 | Spark ML estimator that trains a deep-learning model for detecting sentence boundaries. 58 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators} 59 | } 60 | -------------------------------------------------------------------------------- /man/nlp_sentence_detector_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence_detector_dl.R 3 | \name{nlp_sentence_detector_dl_pretrained} 4 | \alias{nlp_sentence_detector_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Sentence Detector DL model} 6 | \usage{ 7 | nlp_sentence_detector_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | impossible_penultimates = NULL, 12 | model = NULL, 13 | explode_sentences = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{sc}{A Spark connection} 21 | 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column. String.} 25 | 26 | \item{impossible_penultimates}{impossible penultimates} 27 | 28 | \item{model}{model architecture} 29 | 30 | \item{explode_sentences}{a flag indicating whether to split sentences into different Dataset rows} 31 | 32 | \item{name}{the name of the model to load. If NULL will use the default value} 33 | 34 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 35 | 36 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 37 | } 38 | \value{ 39 | The Spark NLP model with the pretrained model loaded 40 | } 41 | \description{ 42 | Create a pretrained Spark NLP \code{SentenceDetectorDLModel} model 43 | } 44 | -------------------------------------------------------------------------------- /man/nlp_sentence_embeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence-embeddings.R 3 | \name{nlp_sentence_embeddings} 4 | \alias{nlp_sentence_embeddings} 5 | \title{Spark NLP SentenceEmbeddings} 6 | \usage{ 7 | nlp_sentence_embeddings( 8 | x, 9 | input_cols, 10 | output_col, 11 | pooling_strategy = NULL, 12 | storage_ref = NULL, 13 | uid = random_string("sentence_embeddings_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{pooling_strategy}{Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM} 24 | 25 | \item{storage_ref}{storage reference for the embeddings} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that converts the results from WordEmbeddings or BertEmbeddings into sentence or document 46 | embeddings by either summing up or averaging all the word embeddings in a sentence or a document 47 | (depending on the input_cols). 48 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#sentenceembeddings} 49 | } 50 | -------------------------------------------------------------------------------- /man/nlp_sentence_entity_resolver_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence_entity_resolver.R 3 | \name{nlp_sentence_entity_resolver_pretrained} 4 | \alias{nlp_sentence_entity_resolver_pretrained} 5 | \title{Load a pretrained Spark NLP Sentence Entity Resolver model} 6 | \usage{ 7 | nlp_sentence_entity_resolver_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | confidence_function = NULL, 13 | distance_function = NULL, 14 | miss_as_empty = NULL, 15 | neighbors = NULL, 16 | threshold = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column.
String.} 28 | 29 | \item{case_sensitive}{whether to treat the entities as case sensitive} 30 | 31 | \item{confidence_function}{what function to use to calculate confidence: INVERSE or SOFTMAX} 32 | 33 | \item{distance_function}{what distance function to use for KNN: 'EUCLIDEAN' or 'COSINE'} 34 | 35 | \item{miss_as_empty}{whether or not to return an empty annotation on unmatched chunks} 36 | 37 | \item{neighbors}{number of neighbours to consider in the KNN query to calculate WMD} 38 | 39 | \item{threshold}{threshold value for the aggregated distance} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{SentenceEntityResolverModel} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_sentiment_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentiment-dl.R 3 | \name{nlp_sentiment_dl_pretrained} 4 | \alias{nlp_sentiment_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Sentiment DL model} 6 | \usage{ 7 | nlp_sentiment_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{SentimentDLModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_set_input_cols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_input_cols} 4 | \alias{nlp_set_input_cols} 5 | \title{Set the input column names} 6 | \usage{ 7 | nlp_set_input_cols(jobj, input_cols) 8 | } 9 | \arguments{ 10 | \item{jobj}{the object to set the input columns on} 11 | 12 | \item{input_cols}{the input column names} 13 | } 14 | \value{ 15 | the jobj object with the input columns set 16 | } 17 | \description{ 18 | Set the input column names 19 | } 20 | -------------------------------------------------------------------------------- /man/nlp_set_output_col.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_output_col} 4 | \alias{nlp_set_output_col} 5 | \title{Set the output column name} 6 | \usage{ 7 | nlp_set_output_col(jobj, output_col) 8 | } 9 | \arguments{ 10 | \item{jobj}{the object to set the output column on} 11 | 12 | \item{output_col}{the output column name} 13 | } 14 | \value{ 15 | the jobj object with the output column set 16 | } 17 | \description{ 18 | Set the output column name 19 | } 20 | -------------------------------------------------------------------------------- /man/nlp_set_param.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_param} 4 | \alias{nlp_set_param} 5 | \title{Set a parameter on an NLP model object} 6 | \usage{ 7 | nlp_set_param(x, param, value) 8 | } 9 | \arguments{ 10 | \item{x}{A Spark NLP object, either a pipeline stage or an annotator} 11 | 12 | \item{param}{The parameter to set} 13 | 14 | \item{value}{The value to use when setting the parameter} 15 | } 16 | \value{ 17 | the NLP model object with the parameter set 18 | } 19 | \description{ 20 | Set a parameter on an NLP model object 21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_set_param_tuple2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_param_tuple2} 4 | \alias{nlp_set_param_tuple2} 5 | \title{Set a Tuple2 parameter on an NLP model object} 6 | \usage{ 7 | nlp_set_param_tuple2(x, param, value) 8 | } 9 | \arguments{ 10 | \item{x}{A Spark NLP object, either a pipeline stage or an annotator} 11 | 12 | \item{param}{The parameter to set} 13 | 14 | \item{value}{The value to use when setting the parameter.
This should be a list of size 2} 15 | } 16 | \value{ 17 | the NLP model object with the parameter set 18 | } 19 | \description{ 20 | Set a Tuple2 parameter on an NLP model object 21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_spark_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation.R 3 | \name{nlp_spark_annotation} 4 | \alias{nlp_spark_annotation} 5 | \title{Create a Spark NLP Annotation object inside of Spark} 6 | \usage{ 7 | nlp_spark_annotation( 8 | sc, 9 | annotatorType, 10 | begin, 11 | end, 12 | result, 13 | metadata, 14 | embeddings = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A \code{spark_connection}} 19 | 20 | \item{annotatorType}{the type of annotation (string)} 21 | 22 | \item{begin}{the index of the first character under this annotation (integer)} 23 | 24 | \item{end}{the index after the last character under this annotation (integer)} 25 | 26 | \item{result}{the main output of the annotation (string)} 27 | 28 | \item{metadata}{associated metadata for this annotation (named list)} 29 | 30 | \item{embeddings}{vector of embeddings (Array(Float)). Currently unimplemented.} 31 | } 32 | \value{ 33 | the Spark NLP Annotation object 34 | } 35 | \description{ 36 | This S3 generic is used for a Spark NLP Annotation object that exists inside of 37 | a Spark session. 38 | } 39 | \seealso{ 40 | \url{https://nlp.johnsnowlabs.com/docs/en/concepts#annotation} 41 | } 42 | -------------------------------------------------------------------------------- /man/nlp_stemmer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stemmer.R 3 | \name{nlp_stemmer} 4 | \alias{nlp_stemmer} 5 | \title{Spark NLP Stemmer} 6 | \usage{ 7 | nlp_stemmer( 8 | x, 9 | input_cols, 10 | output_col, 11 | language = NULL, 12 | uid = random_string("stemmer_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{language}{language to use} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
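As an illustration of that dispatch (hedged sketch; \code{pipeline} and \code{tokens_tbl} are hypothetical objects created earlier, and \code{sc} is an open sparklyr connection):

# spark_connection: returns a stemmer stage for later composition
stemmer <- nlp_stemmer(sc, input_cols = c("token"), output_col = "stem")

# ml_pipeline: appends the stemmer to an existing pipeline
pipeline <- nlp_stemmer(pipeline, input_cols = c("token"), output_col = "stem")

# tbl_spark: constructs the stage and applies it to the data immediately
stemmed_tbl <- nlp_stemmer(tokens_tbl, input_cols = c("token"), output_col = "stem")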
39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that returns hard-stems out of words with the objective of retrieving the meaningful part of the word 43 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#stemmer} 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_stop_words_cleaner.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stop_words_cleaner.R 3 | \name{nlp_stop_words_cleaner} 4 | \alias{nlp_stop_words_cleaner} 5 | \title{Spark NLP StopWordsCleaner} 6 | \usage{ 7 | nlp_stop_words_cleaner( 8 | x, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | locale = NULL, 13 | stop_words = NULL, 14 | uid = random_string("stop_words_cleaner_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{case_sensitive}{Whether to do a case sensitive comparison over the stop words.} 25 | 26 | \item{locale}{Locale of the input for case insensitive matching. Ignored when caseSensitive is true.} 27 | 28 | \item{stop_words}{The words to be filtered out.} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that excludes from a sequence of strings (e.g. the output of a Tokenizer, Normalizer, 49 | Lemmatizer, and Stemmer) and drops all the stop words from the input sequences. 50 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#stopwordscleaner} 51 | } 52 | -------------------------------------------------------------------------------- /man/nlp_symmetric_delete_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/symmetric-delete.R 3 | \name{nlp_symmetric_delete_pretrained} 4 | \alias{nlp_symmetric_delete_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_symmetric_delete_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{SymmetricDeleteModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_t5_transformer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/t5_transformer.R 3 | \name{nlp_t5_transformer} 4 | \alias{nlp_t5_transformer} 5 | \title{Spark NLP T5Transformer} 6 | \usage{ 7 | nlp_t5_transformer( 8 | x, 9 | input_cols, 10 | output_col, 11 | task = NULL, 12 | max_output_length = NULL, 13 | uid = random_string("t5_transformer_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{task}{name to give the task being performed} 24 | 25 | \item{max_output_length}{maximum output length} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that uses Google's T5 text-to-text model to perform sequence-to-sequence tasks such as summarization, translation, and question answering. 46 | See \url{https://nlp.johnsnowlabs.com/api/#com.johnsnowlabs.nlp.annotators.seq2seq.T5Transformer} 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_t5_transformer_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/t5_transformer.R 3 | \name{nlp_t5_transformer_pretrained} 4 | \alias{nlp_t5_transformer_pretrained} 5 | \title{Load a pretrained Spark NLP T5 Transformer model} 6 | \usage{ 7 | nlp_t5_transformer_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | task = NULL, 12 | max_output_length = NULL, 13 | name = NULL, 14 | lang = NULL, 15 | remote_loc = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{sc}{A Spark connection} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{task}{name to give the task being performed} 26 | 27 | \item{max_output_length}{the maximum output length} 28 | 29 | \item{name}{the name of the model to load. If NULL will use the default value} 30 | 31 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 32 | 33 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 34 | } 35 | \value{ 36 | The Spark NLP model with the pretrained model loaded 37 | } 38 | \description{ 39 | Create a pretrained Spark NLP \code{T5TransformerModel} model 40 | } 41 | -------------------------------------------------------------------------------- /man/nlp_token_assembler.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/token-assembler.R 3 | \name{nlp_token_assembler} 4 | \alias{nlp_token_assembler} 5 | \title{Spark NLP TokenAssembler} 6 | \usage{ 7 | nlp_token_assembler( 8 | x, 9 | input_cols, 10 | output_col, 11 | uid = random_string("token_assembler_") 12 | ) 13 | } 14 | \arguments{ 15 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 16 | 17 | \item{input_cols}{Input columns. String array.} 18 | 19 | \item{output_col}{Output column. String.} 20 | 21 | \item{uid}{A character string used to uniquely identify the ML estimator.} 22 | } 23 | \value{ 24 | The object returned depends on the class of \code{x}. 25 | 26 | \itemize{ 27 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 28 | a Spark \code{Estimator} object and can be used to compose 29 | \code{Pipeline} objects. 30 | 31 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 32 | the NLP estimator appended to the pipeline. 33 | 34 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 35 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 36 | } 37 | } 38 | \description{ 39 | Spark ML transformer that reassembles cleaned-up tokens into sentence or document annotations so the reshaped text can be used by downstream annotators. 40 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#tokenassembler-getting-data-reshaped} 41 | } 42 | -------------------------------------------------------------------------------- /man/nlp_typed_dependency_parser_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/typed-dependency-parser.R 3 | \name{nlp_typed_dependency_parser_pretrained} 4 | \alias{nlp_typed_dependency_parser_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_typed_dependency_parser_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{TypedDependencyParserModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_univ_sent_encoder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/univ_sent_encoder.R 3 | \name{nlp_univ_sent_encoder} 4 | \alias{nlp_univ_sent_encoder} 5 | \title{Spark NLP UniversalSentenceEncoder} 6 | \usage{ 7 | nlp_univ_sent_encoder( 8 | x, 9 | input_cols, 10 | output_col, 11 | dimension = NULL, 12 | uid = random_string("univ_sent_encoder_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{dimension}{dimension to use for the embeddings} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that encodes text into high dimensional vectors that can be used for text classification, 43 | semantic similarity, clustering and other natural language tasks. 44 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#universalsentenceencoder} 45 | } 46 | -------------------------------------------------------------------------------- /man/nlp_univ_sent_encoder_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/univ_sent_encoder.R 3 | \name{nlp_univ_sent_encoder_pretrained} 4 | \alias{nlp_univ_sent_encoder_pretrained} 5 | \title{Load pretrained universal sentence encoder} 6 | \usage{ 7 | nlp_univ_sent_encoder_pretrained( 8 | sc, 9 | input_cols = NULL, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Loads pretrained universal sentence encoder into a Spark NLP annotator 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_version} 4 | \alias{nlp_version} 5 | \title{Spark NLP version} 6 | \usage{ 7 | nlp_version() 8 | } 9 | \value{ 10 | the version of the Spark NLP library in use 11 | } 12 | \description{ 13 | Spark NLP version 14 | } 15 | -------------------------------------------------------------------------------- /man/nlp_vivekn_sentiment_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/vivekn-sentiment-detector.R 3 | \name{nlp_vivekn_sentiment_pretrained} 4 | \alias{nlp_vivekn_sentiment_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_vivekn_sentiment_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{ViveknSentimentModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_word_embeddings_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word-embeddings.R 3 | \name{nlp_word_embeddings_model} 4 | \alias{nlp_word_embeddings_model} 5 | \title{Create a Spark NLP WordEmbeddingsModel} 6 | \usage{ 7 | nlp_word_embeddings_model( 8 | sc, 9 | input_cols, 10 | output_col, 11 | storage_ref = NULL, 12 | dimension, 13 | case_sensitive = NULL, 14 | include_storage = NULL, 15 | lazy_annotator = NULL, 16 | read_cache_size = NULL, 17 | include_embeddings = NULL, 18 | uid = random_string("word_embeddings_") 19 | ) 20 | } 21 | \arguments{ 22 | \item{sc}{Spark connection} 23 | 24 | \item{input_cols}{Input columns. String array.} 25 | 26 | \item{output_col}{Output column. 
String.} 27 | 28 | \item{storage_ref}{binding to NerDLModel trained by that embeddings} 29 | 30 | \item{dimension}{number of word embeddings dimensions} 31 | 32 | \item{case_sensitive}{whether to ignore case in tokens for embeddings matching} 33 | 34 | \item{include_storage}{include the storage} 35 | 36 | \item{lazy_annotator}{boolean for laziness} 37 | 38 | \item{read_cache_size}{size for the read cache} 39 | 40 | \item{include_embeddings}{whether or not to include word embeddings when saving this annotator to disk (single or within pipeline)} 41 | 42 | \item{uid}{unique identifier for this instance} 43 | } 44 | \value{ 45 | a Spark transformer WordEmbeddingsModel 46 | } 47 | \description{ 48 | This function creates a WordEmbeddingsModel which uses the provided embeddings_ref. 49 | } 50 | -------------------------------------------------------------------------------- /man/nlp_word_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word-embeddings.R 3 | \name{nlp_word_embeddings_pretrained} 4 | \alias{nlp_word_embeddings_pretrained} 5 | \title{Load pretrained word embeddings} 6 | \usage{ 7 | nlp_word_embeddings_pretrained( 8 | sc, 9 | input_cols = NULL, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL, 14 | case_sensitive = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{name}{the name of the model to load. If NULL will use the default value} 25 | 26 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 27 | 28 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 29 | 30 | \item{case_sensitive}{whether to treat the words as case sensitive} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Loads pretrained word embeddings into a Spark NLP annotator 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_xlm_roberta_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlm-roberta-embeddings.R 3 | \name{nlp_xlm_roberta_embeddings_pretrained} 4 | \alias{nlp_xlm_roberta_embeddings_pretrained} 5 | \title{Spark NLP XlmRoBertaEmbeddings} 6 | \usage{ 7 | nlp_xlm_roberta_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | storage_ref = NULL, 16 | name = NULL, 17 | lang = NULL, 18 | remote_loc = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column. 
String.} 25 | 26 | \item{batch_size}{Size of every batch (Default depends on model).} 27 | 28 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 29 | 30 | \item{dimension}{Number of embedding dimensions (Default depends on model)} 31 | 32 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 33 | 34 | \item{storage_ref}{Unique identifier for storage (Default: this.uid)} 35 | 36 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 37 | 38 | \item{uid}{A character string used to uniquely identify the ML estimator.} 39 | } 40 | \value{ 41 | The object returned depends on the class of \code{x}. 42 | 43 | \itemize{ 44 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 45 | a Spark \code{Estimator} object and can be used to compose 46 | \code{Pipeline} objects. 47 | 48 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 49 | the NLP estimator appended to the pipeline. 50 | 51 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 52 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 53 | } 54 | } 55 | \description{ 56 | Spark ML transformer that 57 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#xmlrobertaembeddings} 58 | } 59 | -------------------------------------------------------------------------------- /man/nlp_xlm_roberta_sentence_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlm_roberta_sentence_embeddings.R 3 | \name{nlp_xlm_roberta_sentence_embeddings_pretrained} 4 | \alias{nlp_xlm_roberta_sentence_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP XlmRoBertaSentenceEmbeddings model} 6 | \usage{ 7 | nlp_xlm_roberta_sentence_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to lowercase tokens or not} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{defines the output layer of BERT when calculating embeddings} 32 | 33 | \item{max_sentence_length}{max sentence length to process} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{XlmRoBertaSentenceEmbeddings} model. 46 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#xlmrobertasentenceembeddings} 47 | } 48 | \details{ 49 | Sentence-level embeddings using XLM-RoBERTa. 
The XLM-RoBERTa model was proposed in 50 | Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, 51 | Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, 52 | Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on 53 | Facebook's RoBERTa model released in 2019. It is a large multi-lingual language model, 54 | trained on 2.5TB of filtered CommonCrawl data. 55 | } 56 | -------------------------------------------------------------------------------- /man/nlp_xlnet_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlnet-embeddings.R 3 | \name{nlp_xlnet_embeddings_pretrained} 4 | \alias{nlp_xlnet_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP XlnetEmbeddings model} 6 | \usage{ 7 | nlp_xlnet_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | lazy_annotator = NULL, 15 | max_sentence_length = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. String.} 28 | 29 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 30 | 31 | \item{batch_size}{batch size} 32 | 33 | \item{dimension}{the embedding dimension} 34 | 35 | \item{lazy_annotator}{use as a lazy annotator or not} 36 | 37 | \item{max_sentence_length}{set the maximum sentence length} 38 | 39 | \item{storage_ref}{storage reference name} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{XlnetEmbeddings} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_xlnet_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlnet-for-token-classification.R 3 | \name{nlp_xlnet_token_classification_pretrained} 4 | \alias{nlp_xlnet_token_classification_pretrained} 5 | \title{Spark NLP XlnetForTokenClassification} 6 | \usage{ 7 | nlp_xlnet_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. 
String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | XlnetForTokenClassification can load XLNet Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#xlnetfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/set_nlp_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{set_nlp_version} 4 | \alias{set_nlp_version} 5 | \title{Set the version of the Spark NLP library to use} 6 | \usage{ 7 | set_nlp_version(version) 8 | } 9 | \arguments{ 10 | \item{version}{Spark NLP version number to use when starting Spark Session} 11 | } 12 | \description{ 13 | Set the version of the Spark NLP library to use 14 | } 15 | -------------------------------------------------------------------------------- /sparknlp.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(sparklyr) 3 | library(sparknlp) 4 | 5 | if (identical(Sys.getenv("NOT_CRAN"), "true")) { 6 | test_check("sparknlp") 7 | on.exit({spark_disconnect_all()}) 8 | } 9 | 10 | -------------------------------------------------------------------------------- /tests/testthat/.gitignore: -------------------------------------------------------------------------------- 1 | metastore_db 2 | -------------------------------------------------------------------------------- /tests/testthat/data/.gitignore: -------------------------------------------------------------------------------- 1 | dist.psv 2 | 
.result.conll.crc 3 | result.conll 4 | -------------------------------------------------------------------------------- /tests/testthat/data/dependency_treebank/wsj_0001.dp: -------------------------------------------------------------------------------- 1 | Pierre NNP 2 2 | Vinken NNP 8 3 | , , 2 4 | 61 CD 5 5 | years NNS 6 6 | old JJ 2 7 | , , 2 8 | will MD 0 9 | join VB 8 10 | the DT 11 11 | board NN 9 12 | as IN 9 13 | a DT 15 14 | nonexecutive JJ 15 15 | director NN 12 16 | Nov. NNP 9 17 | 29 CD 16 18 | . . 8 19 | 20 | Mr. NNP 2 21 | Vinken NNP 3 22 | is VBZ 0 23 | chairman NN 3 24 | of IN 4 25 | Elsevier NNP 7 26 | N.V. NNP 12 27 | , , 12 28 | the DT 12 29 | Dutch NNP 12 30 | publishing VBG 12 31 | group NN 5 32 | . . 3 33 | -------------------------------------------------------------------------------- /tests/testthat/data/en.test.conllu: -------------------------------------------------------------------------------- 1 | # newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200 2 | # sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0001 3 | # newpar id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-p0001 4 | # text = What if Google Morphed Into GoogleOS? 5 | 1 What what PRON WP PronType=Int 0 root 0:root _ 6 | 2 if if SCONJ IN _ 4 mark 4:mark _ 7 | 3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ 8 | 4 Morphed morph VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl:if _ 9 | 5 Into into ADP IN _ 6 case 6:case _ 10 | 6 GoogleOS GoogleOS PROPN NNP Number=Sing 4 obl 4:obl:into SpaceAfter=No 11 | 7 ? ? PUNCT . _ 4 punct 4:punct _ 12 | 13 | # newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200 14 | # sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0003 15 | # text = Google is a nice search engine. 16 | 1 Google Google PROPN NNP Number=Sing 6 nsubj 6:nsubj _ 17 | 2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ 18 | 3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _ 19 | 4 nice nice ADJ JJ Degree=Pos 6 amod 6:amod _ 20 | 5 search search NOUN NN Number=Sing 6 compound 6:compound _ 21 | 6 engine engine NOUN NN Number=Sing 0 root 0:root SpaceAfter=No 22 | 7 . . PUNCT . _ 6 punct 6:punct _ 23 | 24 | # sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0004 25 | # text = Does anybody use it for anything else? 26 | 1 Does do AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux 3:aux _ 27 | 2 anybody anybody PRON NN Number=Sing 3 nsubj 3:nsubj _ 28 | 3 use use VERB VB VerbForm=Inf 0 root 0:root _ 29 | 4 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 3 obj 3:obj _ 30 | 5 for for ADP IN _ 6 case 6:case _ 31 | 6 anything anything PRON NN Number=Sing 3 obl 3:obl:for _ 32 | 7 else else ADJ JJ Degree=Pos 6 amod 6:amod SpaceAfter=No 33 | 8 ? ? PUNCT . 
_ 3 punct 3:punct _ -------------------------------------------------------------------------------- /tests/testthat/data/entities.txt: -------------------------------------------------------------------------------- 1 | i think 2 | Feeling strangely 3 | guitar lessons -------------------------------------------------------------------------------- /tests/testthat/data/entity_ruler/patterns.csv: -------------------------------------------------------------------------------- 1 | PERSON|Jon 2 | PERSON|John 3 | PERSON|John Snow 4 | LOCATION|Winterfell -------------------------------------------------------------------------------- /tests/testthat/data/gender.csv: -------------------------------------------------------------------------------- 1 | male,man,male,boy,gentleman,he,him 2 | female,woman,female,girl,lady,old-lady,she,her 3 | neutral,neutral -------------------------------------------------------------------------------- /tests/testthat/data/gender.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": "Gender", 3 | "ruleScope": "sentence", 4 | "completeMatchRegex": "true" 5 | } 6 | -------------------------------------------------------------------------------- /tests/testthat/data/pos_corpus.txt: -------------------------------------------------------------------------------- 1 | the|DT cats|NNS are|VBP laying|VBG in|IN front|JJ of|IN the|DT fireplace|NN .|. the|DT dogs|NNS are|VBP 2 | staying|VBG cool|NN in|IN the|DT kitchen|NN .|. 3 | -------------------------------------------------------------------------------- /tests/testthat/data/re_train.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/re_train.parquet -------------------------------------------------------------------------------- /tests/testthat/data/regex_match.txt: -------------------------------------------------------------------------------- 1 | the\\s\\w+, followed by ‘the' 2 | -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | This movie is the best movie I have watched ever! In my opinion this movie can win an award.,0 3 | This was a terrible movie!
The acting was bad really bad!,1 -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/.part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/sentiment.parquet/.part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/sentiment.parquet/_SUCCESS -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/sentiment.parquet/part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet -------------------------------------------------------------------------------- /tests/testthat/data/sentiment_dictionary.txt: -------------------------------------------------------------------------------- 1 | superb,positive 2 | bad,negative 3 | lack of, revert 4 | very,increment 5 | barely,decrement 6 | -------------------------------------------------------------------------------- /tests/testthat/data/words.txt: -------------------------------------------------------------------------------- 1 | abacay 2 | abacas 3 | abacate 4 | abacaxi 5 | abaci 6 | abacinate 7 | abacination 8 | abacisci 9 | abaciscus 10 | abacist 11 | aback 12 | abacli 13 | Abaco 14 | abacot 15 | abacterial 16 | abactinal 17 | abactinally 18 | abaction 19 | abactor 20 | abaculi 21 | abaculus 22 | abacus 23 | abacuses 24 | Abad 25 | abada 26 | Abadan 27 | Abaddon 28 | abadejo 29 | abadengo 30 | abadia 31 | Abadite 32 | abaff 33 | abaft 34 | Abagael 35 | Abagail 36 | Abagtha 37 | abay 38 | abayah 39 | Abailard 40 | abaisance 41 | abaised 42 | abaiser 43 | abaisse 44 | abaissed 45 | abaka 46 | Abakan 47 | abakas 48 | Abakumov 49 | abalation 50 | abalienate 51 | abalienated 52 | abalienating 53 | abalienation 54 | abalone 55 | abalones 56 | Abama 57 | abamp 58 | abampere 59 | abamperes 60 | abamps 61 | Abana -------------------------------------------------------------------------------- /tests/testthat/testthat-albert-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | # config <- spark_config() 4 | # config$`sparklyr.shell.driver-memory` <- "8G" 5 | # sc <- spark_connect(master = "local", version = "2.4.3", config = config) 6 | text_tbl <- testthat_tbl("test_text") 7 | 8 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 9 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 10 | sentdetect <- 
nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 11 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 12 | 13 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 14 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 15 | 16 | assign("sc", sc, envir = parent.frame()) 17 | assign("pipeline", pipeline, envir = parent.frame()) 18 | assign("test_data", test_data, envir = parent.frame()) 19 | }) 20 | 21 | teardown({ 22 | spark_disconnect(sc) 23 | rm(sc, envir = .GlobalEnv) 24 | rm(pipeline, envir = .GlobalEnv) 25 | rm(test_data, envir = .GlobalEnv) 26 | }) 27 | 28 | test_that("nlp_albert_embeddings pretrained", { 29 | model <- nlp_albert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "albert") 30 | transformed_data <- ml_transform(model, test_data) 31 | expect_true("albert" %in% colnames(transformed_data)) 32 | }) 33 | 34 | test_that("nlp_albert_embeddings load", { 35 | model_files <- list.files("~/cache_pretrained/") 36 | albert_model_file <- max(Filter(function(s) startsWith(s, "albert_base"), model_files)) 37 | model <- ml_load(sc, paste0("~/cache_pretrained/", albert_model_file)) 38 | transformed_data <- ml_transform(model, test_data) 39 | expect_true("albert" %in% colnames(transformed_data)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/testthat-albert-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_albert_token_classification pretrained", { 25 | model <- nlp_albert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_albert_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "albert_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-annotation_tool_json_reader.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | assign("sc", sc, envir = parent.frame()) 4 | 
5 | if (file.exists(here::here("tests", "testthat", "data", "result.conll"))) { 6 | file.remove(here::here("tests", "testthat", "data", "result.conll")) 7 | } 8 | }) 9 | 10 | teardown({ 11 | rm(sc, envir = .GlobalEnv) 12 | }) 13 | 14 | test_that("nlp_generate_assertion_read_dataset", { 15 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 16 | reader <- nlp_annotation_tool_json_reader(sc) 17 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 18 | 19 | expect_true("ner_label" %in% colnames(mydf)) 20 | }) 21 | 22 | test_that("nlp_generate_assertion_train_set", { 23 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 24 | reader <- nlp_annotation_tool_json_reader(sc) 25 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 26 | train_df <- nlp_generate_assertion_train_set(reader, mydf) 27 | 28 | expect_true("target" %in% colnames(train_df)) 29 | }) 30 | 31 | test_that("nlp_generate_plain_assertion_train_set", { 32 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 33 | reader <- nlp_annotation_tool_json_reader(sc) 34 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 35 | train_df <- nlp_generate_plain_assertion_train_set(reader, mydf) 36 | 37 | expect_true("assertion" %in% colnames(train_df)) 38 | }) 39 | 40 | test_that("nlp_generate_colln", { 41 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 42 | reader <- nlp_annotation_tool_json_reader(sc) 43 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 44 | train_df <- nlp_generate_colln(reader, mydf, here::here("tests", "testthat", "data", "result.conll")) 45 | 46 | expect_true(file.exists(here::here("tests", "testthat", "data", "result.conll"))) 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/testthat-bert-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | # config <- spark_config() 4 | # config$`sparklyr.shell.driver-memory` <- "8G" 5 | # sc <- spark_connect(master = "local", version = "2.4.3", config = config) 6 | text_tbl <- testthat_tbl("test_text") 7 | 8 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 9 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 10 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 11 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 12 | 13 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 14 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 15 | 16 | assign("sc", sc, envir = parent.frame()) 17 | assign("pipeline", pipeline, envir = parent.frame()) 18 | assign("test_data", test_data, envir = parent.frame()) 19 | }) 20 | 21 | teardown({ 22 | spark_disconnect(sc) 23 | rm(sc, envir = .GlobalEnv) 24 | rm(pipeline, envir = .GlobalEnv) 25 | rm(test_data, envir = .GlobalEnv) 26 | }) 27 | 28 | test_that("nlp_bert_embeddings pretrained", { 29 | model <- nlp_bert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 30 | transformed_data <- ml_transform(model, test_data) 31 | expect_true("bert" %in% colnames(transformed_data)) 32 | }) 33 | 34 | test_that("nlp_bert_embeddings load", { 35 | model_files <- list.files("~/cache_pretrained/") 36 | bert_model_file <- max(Filter(function(s) startsWith(s, 
"small_bert"), model_files)) 37 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 38 | transformed_data <- ml_transform(model, test_data) 39 | expect_true("bert" %in% colnames(transformed_data)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/testthat-bert-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_bert_token_classification pretrained", { 25 | model <- nlp_bert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_bert_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "bert_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-bert_sentence_chunk_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentence <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | word_embeddings <- nlp_bert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "word_embeddings", 10 | name = "biobert_pubmed_base_cased") 11 | ner_model <- nlp_medical_ner_pretrained(sc, input_cols = c("sentence", "token", "word_embeddings"), output_col = "ner", 12 | name = "ner_clinical_biobert", remote_loc = "clinical/models") 13 | ner_converter <- nlp_ner_converter(sc, input_cols = c("sentence", "token", "ner"), output_col = "ner_chunk") 14 | 15 | pipeline <- ml_pipeline(assembler, sentence, tokenizer, word_embeddings, ner_model, ner_converter) 16 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 17 | 18 | assign("sc", sc, envir = parent.frame()) 19 | assign("pipeline", pipeline, 
envir = parent.frame()) 20 | assign("test_data", test_data, envir = parent.frame()) 21 | }) 22 | 23 | teardown({ 24 | spark_disconnect(sc) 25 | rm(sc, envir = .GlobalEnv) 26 | rm(pipeline, envir = .GlobalEnv) 27 | rm(test_data, envir = .GlobalEnv) 28 | }) 29 | 30 | test_that("nlp_bert_sentence_embeddings pretrained", { 31 | model <- nlp_bert_sentence_chunk_embeddings_pretrained(sc, input_cols = c("sentence", "ner_chunk"), output_col = "bert_sentence_chunk_embeddings") 32 | transformed_data <- ml_transform(model, test_data) 33 | expect_true("bert_sentence_chunk_embeddings" %in% colnames(transformed_data)) 34 | 35 | expect_true(inherits(model, "nlp_bert_sentence_chunk_embeddings")) 36 | }) 37 | 38 | -------------------------------------------------------------------------------- /tests/testthat/testthat-bert_sentence_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_bert_sentence_embeddings pretrained", { 24 | model <- nlp_bert_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "bert_sentence_embeddings") 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("bert_sentence_embeddings" %in% colnames(transformed_data)) 27 | 28 | expect_true(inherits(model, "nlp_bert_sentence_embeddings")) 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/testthat-chunk2token.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | ngram <- nlp_ngram_generator(sc, input_cols = c("token"), output_col = "ngram", n = 2) 10 | 11 | 12 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, ngram) 13 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 14 | 15 | assign("sc", sc, envir = parent.frame()) 16 | assign("pipeline", pipeline, envir = parent.frame()) 17 | assign("test_data", test_data, envir = parent.frame()) 18 | }) 19 | 20 | teardown({ 21 | rm(sc, envir = .GlobalEnv) 22 | rm(pipeline, envir = .GlobalEnv) 23 | rm(test_data, envir = .GlobalEnv) 24 | }) 25 | 26 | test_that("chunk2token param setting", { 27 | test_args <- list( 28 | input_cols = c("string1"), 29 | output_col = "string1" 30 | ) 31 | 32 | test_param_setting(sc, nlp_chunk2token, test_args) 33 | }) 34 | 35 | test_that("nlp_chunk2token 
spark_connection", { 36 | test_annotator <- nlp_chunk2token(sc, input_cols = c("ngram"), output_col = "token_chunk") 37 | transformed_data <- ml_transform(test_annotator, test_data) 38 | expect_true("token_chunk" %in% colnames(transformed_data)) 39 | 40 | expect_true(inherits(test_annotator, "nlp_chunk2token")) 41 | }) 42 | 43 | test_that("nlp_chunk2token ml_pipeline", { 44 | test_annotator <- nlp_chunk2token(pipeline, input_cols = c("ngram"), output_col = "token_chunk") 45 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 46 | expect_true("token_chunk" %in% colnames(transformed_data)) 47 | }) 48 | 49 | test_that("nlp_chunk2token tbl_spark", { 50 | transformed_data <- nlp_chunk2token(test_data, input_cols = c("ngram"), output_col = "token_chunk") 51 | expect_true("token_chunk" %in% colnames(transformed_data)) 52 | }) 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/testthat-chunker.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | pos <- nlp_perceptron_pretrained(sc, input_cols = c("sentence", "token"), output_col = "pos") 10 | 11 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, pos) 12 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 13 | 14 | assign("sc", sc, envir = parent.frame()) 15 | assign("pipeline", pipeline, envir = parent.frame()) 16 | assign("test_data", test_data, envir = parent.frame()) 17 | }) 18 | 19 | teardown({ 20 | spark_disconnect(sc) 21 | rm(sc, envir = .GlobalEnv) 22 | rm(pipeline, envir = .GlobalEnv) 23 | rm(test_data, envir = .GlobalEnv) 24 | }) 25 | 26 | test_that("nlp_chunker param setting", { 27 | test_args <- list( 28 | input_cols = c("sentence", "pos"), 29 | output_col = "chunk", 30 | regex_parsers = c("
?*", "+") 31 | ) 32 | 33 | test_param_setting(sc, nlp_chunker, test_args) 34 | }) 35 | 36 | test_that("nlp_nlp_chunker spark_connection", { 37 | test_annotator <- nlp_chunker(sc, input_cols = c("sentence","pos"), output_col = "chunk") 38 | transformed_data <- ml_transform(test_annotator, test_data) 39 | expect_true("chunk" %in% colnames(transformed_data)) 40 | }) 41 | 42 | test_that("nlp_nlp_chunker ml_pipeline", { 43 | test_annotator <- nlp_chunker(pipeline, input_cols = c("sentence","pos"), output_col = "chunk") 44 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 45 | expect_true("chunk" %in% colnames(transformed_data)) 46 | }) 47 | 48 | test_that("nlp_nlp_chunker tbl_spark", { 49 | transformed_data <- nlp_chunker(test_data, input_cols = c("sentence","pos"), output_col = "chunk") 50 | expect_true("chunk" %in% colnames(transformed_data)) 51 | }) 52 | 53 | -------------------------------------------------------------------------------- /tests/testthat/testthat-distilbert-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_distilbert_embeddings pretrained", { 25 | model <- nlp_distilbert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "distilbert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("distilbert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_distilbert_embeddings load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "distilbert_base"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | transformed_data <- ml_transform(model, test_data) 35 | expect_true("embeddings" %in% colnames(transformed_data)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/testthat-distilbert-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("document"), output_col = "token") 9 | # TODO: put other annotators here as needed 10 | 11 | pipeline <- ml_pipeline(assembler, sentdetect, 
tokenizer) 12 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 13 | 14 | assign("sc", sc, envir = parent.frame()) 15 | assign("pipeline", pipeline, envir = parent.frame()) 16 | assign("test_data", test_data, envir = parent.frame()) 17 | }) 18 | 19 | teardown({ 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | 26 | test_that("nlp_distilbert_token_classification pretrained", { 27 | model <- nlp_distilbert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "distilbert") 28 | transformed_data <- ml_transform(model, test_data) 29 | expect_true("distilbert" %in% colnames(transformed_data)) 30 | }) 31 | 32 | test_that("nlp_distilbert_token_classification load", { 33 | model_files <- list.files("~/cache_pretrained/") 34 | bert_model_file <- max(Filter(function(s) startsWith(s, "distilbert_base_token"), model_files)) 35 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 36 | model <- nlp_set_output_col(model, "label") 37 | transformed_data <- ml_transform(model, test_data) 38 | expect_true("label" %in% colnames(transformed_data)) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/testthat-document-assembler.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | assign("sc", sc, envir = parent.frame()) 5 | assign("text_tbl", text_tbl, envir = parent.frame()) 6 | }) 7 | 8 | teardown({ 9 | spark_disconnect(sc) 10 | rm(sc, envir = .GlobalEnv) 11 | rm(text_tbl, envir = .GlobalEnv) 12 | }) 13 | 14 | test_that("nlp_document_assembler() param setting", { 15 | test_args <- list( 16 | input_col = "text", 17 | output_col = "document", 18 | id_col = "rowkey", 19 | metadata_col = "met", 20 | cleanup_mode = "shrink") 21 | test_param_setting(sc, nlp_document_assembler, test_args) 22 | }) 23 | 24 | test_that("nlp_document_assembler() spark_connection", { 25 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 26 | transformed_data <- ml_transform(assembler, text_tbl) 27 | 28 | expect_true("document" %in% colnames(transformed_data)) 29 | }) 30 | 31 | test_that("nlp_document_assembler() ml_pipeline", { 32 | pipeline <- ml_pipeline(sc) 33 | assembler <- nlp_document_assembler(pipeline, input_col = "text", output_col = "document") 34 | 35 | transformed_data <- ml_fit_and_transform(assembler, text_tbl) 36 | 37 | expect_true("document" %in% colnames(transformed_data)) 38 | }) 39 | 40 | test_that("nlp_document_assembler() tbl_spark", { 41 | transformed_data <- nlp_document_assembler(text_tbl, input_col = "text", output_col = "document") 42 | expect_true("document" %in% colnames(transformed_data)) 43 | }) 44 | -------------------------------------------------------------------------------- /tests/testthat/testthat-drug_normalizer.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | 
assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | rm(sc, envir = .GlobalEnv) 18 | rm(pipeline, envir = .GlobalEnv) 19 | rm(test_data, envir = .GlobalEnv) 20 | }) 21 | 22 | test_that("drug_normalizer param setting", { 23 | test_args <- list( 24 | input_cols = c("string1"), 25 | output_col = "string1", 26 | lower_case = TRUE, 27 | policy = "string1" 28 | ) 29 | 30 | test_param_setting(sc, nlp_drug_normalizer, test_args) 31 | }) 32 | 33 | test_that("nlp_drug_normalizer spark_connection", { 34 | test_annotator <- nlp_drug_normalizer(sc, input_cols = c("document"), output_col = "document_normalized") 35 | transformed_data <- ml_transform(test_annotator, test_data) 36 | expect_true("document_normalized" %in% colnames(transformed_data)) 37 | expect_true(inherits(test_annotator, "nlp_drug_normalizer")) 38 | }) 39 | 40 | test_that("nlp_drug_normalizer ml_pipeline", { 41 | test_annotator <- nlp_drug_normalizer(pipeline, input_cols = c("document"), output_col = "document_normalized") 42 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 43 | expect_true("document_normalized" %in% colnames(transformed_data)) 44 | }) 45 | 46 | test_that("nlp_drug_normalizer tbl_spark", { 47 | transformed_data <- nlp_drug_normalizer(test_data, input_cols = c("document"), output_col = "document_normalized") 48 | expect_true("document_normalized" %in% colnames(transformed_data)) 49 | }) 50 | 51 | -------------------------------------------------------------------------------- /tests/testthat/testthat-elmo-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 6 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 7 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 8 | 9 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 10 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 11 | 12 | assign("sc", sc, envir = parent.frame()) 13 | assign("pipeline", pipeline, envir = parent.frame()) 14 | assign("test_data", test_data, envir = parent.frame()) 15 | }) 16 | 17 | teardown({ 18 | spark_disconnect(sc) 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_elmo_embeddings pretrained", { 25 | model <- nlp_elmo_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "elmo") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("elmo" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_elmo_embeddings load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | elmo_model_file <- max(Filter(function(s) startsWith(s, "elmo_"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", elmo_model_file)) 34 | transformed_data <- ml_transform(model, test_data) 35 | expect_true("elmo" %in% colnames(transformed_data)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/testthat-finisher.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that 
will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("finisher param setting", { 26 | test_args <- list( 27 | input_cols = c("token"), 28 | output_cols = c("finisher_token"), 29 | clean_annotations = TRUE, 30 | value_split_symbol = "#", 31 | annotation_split_symbol = "@", 32 | include_metadata = TRUE, 33 | output_as_array = FALSE 34 | ) 35 | 36 | test_param_setting(sc, nlp_finisher, test_args) 37 | }) 38 | 39 | test_that("nlp_finisher spark_connection", { 40 | test_annotator <- nlp_finisher(sc, input_cols = c("token")) 41 | transformed_data <- ml_transform(test_annotator, test_data) 42 | expect_true("finished_token" %in% colnames(transformed_data)) 43 | }) 44 | 45 | test_that("nlp_finisher ml_pipeline", { 46 | test_annotator <- nlp_finisher(sc, input_cols = c("token")) 47 | transformed_data <- ml_transform(test_annotator, test_data) 48 | expect_true("finished_token" %in% colnames(transformed_data)) 49 | }) 50 | 51 | test_that("nlp_finisher tbl_spark", { 52 | transformed_data <- nlp_finisher(test_data, input_cols = c("token")) 53 | expect_true("finished_token" %in% colnames(transformed_data)) 54 | }) 55 | 56 | -------------------------------------------------------------------------------- /tests/testthat/testthat-language-detector-dl.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_language_detector pretrained", { 24 | model <- nlp_language_detector_dl_pretrained(sc, input_cols = c("document"), output_col = "language", threshold = 0.2) 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("language" %in% colnames(transformed_data)) 27 | 28 | # Test Float parameters 29 | oldvalue <- ml_param(model, "threshold") 30 | newmodel <- nlp_set_param(model, "threshold", 0.8) 31 | newvalue <- ml_param(newmodel, "threshold") 32 | 33 | expect_equal(newvalue, 0.8) 34 | }) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/testthat/testthat-light-pipeline.R: 
-------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | embeddings <- nlp_word_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings") 10 | 11 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, embeddings) 12 | fit_pipeline <- ml_fit(pipeline, text_tbl) 13 | 14 | assign("sc", sc, envir = parent.frame()) 15 | assign("fit_pipeline", fit_pipeline, envir = parent.frame()) 16 | assign("text_tbl", text_tbl, envir = parent.frame()) 17 | }) 18 | 19 | teardown({ 20 | spark_disconnect(sc) 21 | rm(sc, envir = .GlobalEnv) 22 | rm(fit_pipeline, envir = .GlobalEnv) 23 | rm(text_tbl, envir = .GlobalEnv) 24 | }) 25 | 26 | test_that("nlp_light_pipeline data frame annotate", { 27 | light_pipeline <- nlp_light_pipeline(fit_pipeline) 28 | result <- nlp_annotate(light_pipeline, text_tbl, "text") 29 | expect_true("embeddings" %in% colnames(result)) 30 | }) 31 | 32 | test_that("nlp_light_pipeline pre-trained", { 33 | pipeline <- nlp_pretrained_pipeline(sc, "explain_document_ml", lang = "en") 34 | light_pipeline <- nlp_light_pipeline(pipeline) 35 | result <- nlp_annotate(light_pipeline, "French author who helped pioneer the science-fiction genre. Verne wrote about space, air, and underwater travel before navigable aircraft and practical submarines were invented, and before any means of space travel had been devised.") 36 | expect_true("token" %in% names(result)) 37 | }) 38 | 39 | -------------------------------------------------------------------------------- /tests/testthat/testthat-longformer-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_longformer_embeddings pretrained", { 25 | model <- nlp_longformer_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("embeddings" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_longformer_embeddings load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s,
"longformer_base"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | transformed_data <- ml_transform(model, test_data) 35 | expect_true("embeddings" %in% colnames(transformed_data)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/testthat-longformer-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_longformer_token_classification pretrained", { 25 | model <- nlp_longformer_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_longformer_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "longformer_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-ngram-generator.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("nlp_ngram_generator param setting", { 26 | test_args <- list( 27 | input_cols = c("string1"), 28 | output_col = "string1", 29 | n = 2, 30 | enable_cumulative = TRUE, 31 | delimiter = "_" 32 | ) 33 | 34 | 
test_param_setting(sc, nlp_ngram_generator, test_args) 35 | }) 36 | 37 | test_that("nlp_ngram_generator spark_connection", { 38 | test_annotator <- nlp_ngram_generator(sc, input_cols = c("token"), output_col = "ngrams", n = 2) 39 | transformed_data <- ml_transform(test_annotator, test_data) 40 | expect_true("ngrams" %in% colnames(transformed_data)) 41 | }) 42 | 43 | test_that("nlp_ngram_generator ml_pipeline", { 44 | test_annotator <- nlp_ngram_generator(pipeline, input_cols = c("token"), output_col = "ngrams", n = 2, enable_cumulative = TRUE) 45 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 46 | expect_true("ngrams" %in% colnames(transformed_data)) 47 | }) 48 | 49 | test_that("nlp_ngram_generator tbl_spark", { 50 | transformed_data <- nlp_ngram_generator(test_data, input_cols = c("token"), output_col = "ngrams") 51 | expect_true("ngrams" %in% colnames(transformed_data)) 52 | }) 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/testthat-pretrained-pipeline.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | assign("sc", sc, envir = parent.frame()) 6 | assign("text_tbl", text_tbl, envir = parent.frame()) 7 | }) 8 | 9 | teardown({ 10 | spark_disconnect(sc) 11 | rm(sc, envir = .GlobalEnv) 12 | rm(text_tbl, envir = .GlobalEnv) 13 | }) 14 | 15 | test_that("nlp_pretrained_pipeline() tbl_spark", { 16 | result <- nlp_pretrained_pipeline(text_tbl, "recognize_entities_dl") 17 | expect_true("entities" %in% colnames(result)) 18 | }) 19 | 20 | test_that("nlp_pretrained_pipeline() spark_connection", { 21 | result <- nlp_pretrained_pipeline(sc, "recognize_entities_dl") 22 | expect_equal(jobj_class(spark_jobj(result)), c("PretrainedPipeline", "Object")) 23 | }) 24 | 25 | test_that("nlp_pretrained_pipeline annotate", { 26 | pipeline <- nlp_pretrained_pipeline(sc, "recognize_entities_dl") 27 | annotations <- nlp_annotate(pipeline, text_tbl, column = "text") 28 | expect_true("entities" %in% colnames(annotations)) 29 | }) 30 | 31 | test_that("as_pipeline_model().nlp_pretrained_pipeline", { 32 | pipeline <- nlp_pretrained_pipeline(sc, "recognize_entities_dl") 33 | pm <- as_pipeline_model(pipeline) 34 | expect_s3_class(pm, "ml_pipeline_model") 35 | }) 36 | -------------------------------------------------------------------------------- /tests/testthat/testthat-pubtator.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("nlp_pubtator_read_dataset", { 3 | sc <- testthat_spark_connection() 4 | pubtator <- nlp_pubtator_read_dataset(sc, here::here("tests", "testthat", "data", "corpus_pubtator_sample.txt")) 5 | expect_true("doc_id" %in% colnames(pubtator)) 6 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-recursive-pipeline.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | #pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, embeddings) 6 | #fit_pipeline <- ml_fit(pipeline, text_tbl) 7 | 8 | assign("sc", sc, envir = parent.frame()) 9 | #assign("fit_pipeline", fit_pipeline, envir = parent.frame()) 10 | assign("text_tbl", text_tbl, envir = parent.frame()) 11 | }) 12 | 13 | teardown({ 14 | spark_disconnect(sc) 15 | rm(sc, envir = .GlobalEnv) 16 | 
#rm(fit_pipeline, envir = .GlobalEnv) 17 | rm(text_tbl, envir = .GlobalEnv) 18 | }) 19 | 20 | test_that("nlp_recursive_pipeline spark connection", { 21 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 22 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 23 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 24 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 25 | 26 | recursive_pipeline <- nlp_recursive_pipeline(sc) %>% 27 | ml_add_stage(assembler) %>% 28 | ml_add_stage(sentdetect) %>% 29 | ml_add_stage(tokenizer) 30 | 31 | result <- ml_fit_and_transform(recursive_pipeline, text_tbl) 32 | expect_true("token" %in% colnames(result)) 33 | }) 34 | 35 | test_that("nlp_recursive_pipeline pipeline stages", { 36 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 37 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 38 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 39 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 40 | 41 | recursive_pipeline <- nlp_recursive_pipeline(assembler, sentdetect, tokenizer) 42 | 43 | result <- ml_fit_and_transform(recursive_pipeline, text_tbl) 44 | expect_true("token" %in% colnames(result)) 45 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-roberta-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | 25 | test_that("nlp_roberta_embeddings pretrained", { 26 | model <- nlp_roberta_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings") 27 | transformed_data <- ml_transform(model, test_data) 28 | expect_true("embeddings" %in% colnames(transformed_data)) 29 | }) 30 | 31 | test_that("nlp_distilbert_embeddings load", { 32 | model_files <- list.files("~/cache_pretrained/") 33 | bert_model_file <- max(Filter(function(s) startsWith(s, "roberta_base"), model_files)) 34 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("embeddings" %in% colnames(transformed_data)) 37 | }) 38 | 39 | -------------------------------------------------------------------------------- /tests/testthat/testthat-roberta-for-token-classification.R: 
-------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_roberta_token_classification pretrained", { 25 | model <- nlp_roberta_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_roberta_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "roberta_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-roberta_sentence_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_roberta_sentence_embeddings pretrained", { 24 | model <- nlp_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "roberta_sentence_embeddings") 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("roberta_sentence_embeddings" %in% colnames(transformed_data)) 27 | 28 | expect_true(inherits(model, "nlp_roberta_sentence_embeddings")) 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/testthat-sentence-detector.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 5 | 
pipeline <- ml_pipeline(assembler) 6 | document_data <- ml_transform(assembler, text_tbl) 7 | 8 | assign("sc", sc, envir = parent.frame()) 9 | assign("pipeline", pipeline, envir = parent.frame()) 10 | assign("document_data", document_data, envir = parent.frame()) 11 | }) 12 | 13 | teardown({ 14 | spark_disconnect(sc) 15 | rm(sc, envir = .GlobalEnv) 16 | rm(pipeline, envir = .GlobalEnv) 17 | rm(document_data, envir = .GlobalEnv) 18 | }) 19 | 20 | test_that("nlp_sentence_detector() param setting", { 21 | test_args <- list( 22 | input_cols = c("document"), 23 | output_col = "sentence", 24 | custom_bounds = c(":"), 25 | use_custom_only = FALSE, 26 | use_abbreviations = TRUE, 27 | explode_sentences = FALSE, 28 | detect_lists = TRUE, 29 | min_length = 20, 30 | max_length = 400, 31 | split_length = 250 32 | ) 33 | test_param_setting(sc, nlp_sentence_detector, test_args) 34 | }) 35 | 36 | test_that("nlp_sentence_detector() spark_connection", { 37 | detector <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 38 | transformed_data <- ml_transform(detector, document_data) 39 | 40 | expect_true("sentence" %in% colnames(transformed_data)) 41 | }) 42 | 43 | test_that("nlp_sentence_detector() ml_pipeline", { 44 | detector <- nlp_sentence_detector(pipeline, input_cols = c("document"), output_col = "sentence") 45 | pipeline <- ml_pipeline(detector) 46 | 47 | transformed_data <- ml_fit_and_transform(pipeline, document_data) 48 | 49 | expect_true("sentence" %in% colnames(transformed_data)) 50 | }) 51 | 52 | test_that("nlp_sentence_detector() tbl_spark", { 53 | transformed_data <- nlp_sentence_detector(document_data, input_cols = c("document"), output_col = "sentence") 54 | expect_true("sentence" %in% colnames(transformed_data)) 55 | }) 56 | -------------------------------------------------------------------------------- /tests/testthat/testthat-stemmer.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("stemmer param setting", { 26 | test_args <- list( 27 | input_cols = c("string1"), 28 | output_col = "string1", 29 | language = "string1" 30 | ) 31 | 32 | test_param_setting(sc, nlp_stemmer, test_args) 33 | }) 34 | 35 | test_that("nlp_stemmer spark_connection", { 36 | test_annotator <- nlp_stemmer(sc, input_cols = c("token"), output_col = "stem") 37 | transformed_data <- ml_transform(test_annotator, test_data) 38 | expect_true("stem" %in% colnames(transformed_data)) 39 | }) 40 | 41 | test_that("nlp_stemmer ml_pipeline", { 42 | test_annotator <- nlp_stemmer(pipeline, input_cols = c("token"),
output_col = "stem") 43 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 44 | expect_true("stem" %in% colnames(transformed_data)) 45 | }) 46 | 47 | test_that("nlp_stemmer tbl_spark", { 48 | transformed_data <- nlp_stemmer(test_data, input_cols = c("token"), output_col = "stem") 49 | expect_true("stem" %in% colnames(transformed_data)) 50 | }) 51 | 52 | -------------------------------------------------------------------------------- /tests/testthat/testthat-token-assembler.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "normalized") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("token_assembler param setting", { 26 | # TODO: edit these to make them legal values for the parameters 27 | test_args <- list( 28 | input_cols = c("string1", "string2"), 29 | output_col = "string1" 30 | ) 31 | 32 | test_param_setting(sc, nlp_token_assembler, test_args) 33 | }) 34 | 35 | test_that("nlp_token_assembler spark_connection", { 36 | test_annotator <- nlp_token_assembler(sc, input_cols = c("document", "normalized"), output_col = "assembled") 37 | transformed_data <- ml_transform(test_annotator, test_data) 38 | expect_true("assembled" %in% colnames(transformed_data)) 39 | }) 40 | 41 | test_that("nlp_token_assembler ml_pipeline", { 42 | test_annotator <- nlp_token_assembler(pipeline, input_cols = c("document", "normalized"), output_col = "assembled") 43 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 44 | expect_true("assembled" %in% colnames(transformed_data)) 45 | }) 46 | 47 | test_that("nlp_token_assembler tbl_spark", { 48 | transformed_data <- nlp_token_assembler(test_data, input_cols = c("document", "normalized"), output_col = "assembled") 49 | expect_true("assembled" %in% colnames(transformed_data)) 50 | }) 51 | 52 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlm-roberta-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | 
assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_xlm_roberta_embeddings pretrained", { 25 | model <- nlp_xlm_roberta_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings", 26 | name = "xlm_roberta_base") 27 | transformed_data <- ml_transform(model, test_data) 28 | expect_true("embeddings" %in% colnames(transformed_data)) 29 | }) 30 | 31 | test_that("nlp_xlm_roberta_embeddings load", { 32 | model_files <- list.files("~/cache_pretrained/") 33 | bert_model_file <- max(Filter(function(s) startsWith(s, "xlm_roberta_base"), model_files)) 34 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("embeddings" %in% colnames(transformed_data)) 37 | }) 38 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlm_roberta-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_xlm_roberta_token_classification pretrained", { 25 | model <- nlp_xlm_roberta_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_xlm_roberta_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "xlm_roberta_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-xlm_roberta_sentence_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- 
ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_xlm_roberta_sentence_embeddings pretrained", { 24 | model <- nlp_xlm_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "xlm_roberta_sentence_embeddings") 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("xlm_roberta_sentence_embeddings" %in% colnames(transformed_data)) 27 | 28 | expect_true(inherits(model, "nlp_xlm_roberta_sentence_embeddings")) 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlnet-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | # config <- spark_config() 4 | # config$`sparklyr.shell.driver-memory` <- "8G" 5 | # sc <- spark_connect(master = "local", version = "2.4.3", config = config) 6 | text_tbl <- testthat_tbl("test_text") 7 | 8 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 9 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 10 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 11 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 12 | 13 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 14 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 15 | 16 | assign("sc", sc, envir = parent.frame()) 17 | assign("pipeline", pipeline, envir = parent.frame()) 18 | assign("test_data", test_data, envir = parent.frame()) 19 | }) 20 | 21 | teardown({ 22 | spark_disconnect(sc) 23 | rm(sc, envir = .GlobalEnv) 24 | rm(pipeline, envir = .GlobalEnv) 25 | rm(test_data, envir = .GlobalEnv) 26 | }) 27 | 28 | test_that("nlp_xlnet_embeddings pretrained", { 29 | model <- nlp_xlnet_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "xlnet") 30 | transformed_data <- ml_transform(model, test_data) 31 | expect_true("xlnet" %in% colnames(transformed_data)) 32 | }) 33 | 34 | test_that("nlp_xlnet_embeddings load", { 35 | model_files <- list.files("~/cache_pretrained/") 36 | xlnet_model_file <- max(Filter(function(s) startsWith(s, "xlnet_base"), model_files)) 37 | model <- ml_load(sc, paste0("~/cache_pretrained/", xlnet_model_file)) 38 | transformed_data <- ml_transform(model, test_data) 39 | expect_true("xlnet" %in% colnames(transformed_data)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlnet-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 |
pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_xlnet_token_classification pretrained", { 25 | model <- nlp_xlnet_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_xlnet_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "xlnet_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/tf_graphs/RE_in1200D_out20.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/tf_graphs/RE_in1200D_out20.pb -------------------------------------------------------------------------------- /tests/testthat/tf_graphs/blstm_34_32_30_200_6.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/tf_graphs/blstm_34_32_30_200_6.pb -------------------------------------------------------------------------------- /tests/testthat/tf_graphs/blstm_5_200_128_67.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/tf_graphs/blstm_5_200_128_67.pb --------------------------------------------------------------------------------
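Note on the test layout above: every annotator test file follows the same pattern — a setup() block builds a small document assembler → sentence detector → tokenizer pipeline and caches the Spark connection plus the transformed test data, a teardown() block removes those objects, and the annotator is then exercised through its spark_connection, ml_pipeline and tbl_spark dispatch forms, often alongside a test_param_setting() round-trip of its parameters. The sketch below is illustrative only: nlp_some_annotator and the "out" column are placeholders, and the helpers testthat_spark_connection(), testthat_tbl() and test_param_setting() are assumed to come from the suite's helper files, which are not reproduced in this listing.

library(testthat)
library(sparklyr)
library(sparknlp)

setup({
  sc <- testthat_spark_connection()       # assumed helper: shared local Spark connection
  text_tbl <- testthat_tbl("test_text")   # assumed helper: small table with a "text" column

  # Build the columns most annotators consume: document -> sentence -> token
  assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")
  sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence")
  tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token")

  pipeline <- ml_pipeline(assembler, sentdetect, tokenizer)
  test_data <- ml_fit_and_transform(pipeline, text_tbl)

  assign("sc", sc, envir = parent.frame())
  assign("pipeline", pipeline, envir = parent.frame())
  assign("test_data", test_data, envir = parent.frame())
})

teardown({
  rm(sc, envir = .GlobalEnv)
  rm(pipeline, envir = .GlobalEnv)
  rm(test_data, envir = .GlobalEnv)
})

# nlp_some_annotator is a placeholder for whichever annotator is under test
test_that("nlp_some_annotator spark_connection", {
  annotator <- nlp_some_annotator(sc, input_cols = c("token"), output_col = "out")
  transformed <- ml_transform(annotator, test_data)
  expect_true("out" %in% colnames(transformed))
})

test_that("nlp_some_annotator ml_pipeline", {
  annotator <- nlp_some_annotator(pipeline, input_cols = c("token"), output_col = "out")
  transformed <- ml_fit_and_transform(annotator, test_data)
  expect_true("out" %in% colnames(transformed))
})

test_that("nlp_some_annotator tbl_spark", {
  transformed <- nlp_some_annotator(test_data, input_cols = c("token"), output_col = "out")
  expect_true("out" %in% colnames(transformed))
})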