├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE.md ├── NAMESPACE ├── R ├── albert-embeddings.R ├── albert-for-token-classification.R ├── annotate.R ├── annotation.R ├── annotation_tool_json_reader.R ├── assertion_dl.R ├── assertion_filterer.R ├── assertion_logreg.R ├── bert-embeddings.R ├── bert-for-token-classification.R ├── bert_sentence_chunk_embeddings.R ├── bert_sentence_embeddings.R ├── chunk-embeddings.R ├── chunk2doc.R ├── chunk2token.R ├── chunk_entity_resolver.R ├── chunk_filterer.R ├── chunker.R ├── classifier_dl.R ├── context-spell-checker.R ├── contextual_parser.R ├── date-matcher.R ├── date_normalizer.R ├── dependencies.R ├── dependency-parser.R ├── distilbert-embeddings.R ├── distilbert-for-token-classification.R ├── doc2chunk.R ├── document-assembler.R ├── document_logreg_classifier.R ├── document_normalizer.R ├── drug_normalizer.R ├── elmo-embeddings.R ├── embeddings_finisher.R ├── entity_ruler.R ├── finisher.R ├── graph-extraction.R ├── graph-finisher.R ├── java.R ├── language-detector-dl.R ├── lemmatizer.R ├── light-pipeline.R ├── longformer-embeddings.R ├── longformer-for-token-classification.R ├── marian_transformer.R ├── medical-ner.R ├── multi_classifier_dl.R ├── multi_date-matcher.R ├── ner-converter-internal.R ├── ner-converter.R ├── ner-crf.R ├── ner-dl.R ├── ner_chunker.R ├── ngram-generator.R ├── normalizer.R ├── norvig-spell-checker.R ├── perceptron.R ├── pretrained-pipeline.R ├── re_ner_chunks_filter.R ├── recursive-pipeline.R ├── recursive-tokenizer.R ├── regex_matcher.R ├── relation_extraction.R ├── relation_extraction_dl.R ├── resource_downloader.R ├── roberta-embeddings.R ├── roberta-for-token-classification.R ├── roberta_sentence_embeddings.R ├── sentence-detector.R ├── sentence-embeddings.R ├── sentence_detector_dl.R ├── sentence_entity_resolver.R ├── sentiment-detector.R ├── sentiment-dl.R ├── stemmer.R ├── stop_words_cleaner.R ├── symmetric-delete.R ├── t5_transformer.R ├── text-matcher.R ├── token-assembler.R ├── tokenizer.R ├── typed-dependency-parser.R ├── univ_sent_encoder.R ├── utils.R ├── vivekn-sentiment-detector.R ├── word-embeddings.R ├── xlm-roberta-embeddings.R ├── xlm_roberta-for-token-classification.R ├── xlm_roberta_sentence_embeddings.R ├── xlnet-embeddings.R ├── xlnet-for-token-classification.R └── yake_model.R ├── README.md ├── configure.R ├── examples ├── annotation │ ├── Pretrained-MatchPattern-Pipeline.Rmd │ ├── entities.txt │ └── extractor.Rmd ├── quick-start.Rmd ├── training │ ├── Classifier_DL_Train_multi_class_news_category_classifier.Rmd │ ├── Classifier_DL_Train_multi_class_news_category_classifier.nb.html │ ├── ViveknNarayanSentimentApproach.Rmd │ ├── ner_dl.Rmd │ └── ner_dl_crf.Rmd └── tutorials │ ├── 5- How to use Spark NLP and Spark ML Pipelines.Rmd │ └── certification_trainings │ ├── 1.SparkNLP_Basics.Rmd │ ├── 2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.Rmd │ ├── 3.SparkNLP_Pretrained_Models.Rmd │ ├── 4.NERDL_Training.Rmd │ ├── 5.1_Text_classification_examples_in_SparkML_SparkNLP.Rmd │ ├── 5.Text_Classification_with_ClassifierDL.Rmd │ ├── 6.Playground_DataFrames.Rmd │ ├── english_models.png │ ├── nerdl-model.png │ ├── pipeline.png │ ├── pre-trained-pipelines.png │ └── stacked_pipeline.png ├── inst ├── java │ ├── sparknlp-2.4-2.11.jar │ ├── sparknlp-3.0-2.12.jar │ └── sparknlp-3.1-2.12.jar └── sparkml │ ├── class_mapping.json │ └── param_mapping.json ├── java └── main.scala ├── man-roxygen ├── roxlate-input-output-params.R ├── roxlate-input-outputs-params.R ├── roxlate-inputs-output-params.R 
├── roxlate-inputs-outputs-params.R ├── roxlate-nlp-algo.R ├── roxlate-nlp-ml-algo.R ├── roxlate-nlp-transformer.R └── roxlate-pretrained-params.R ├── man ├── as_pipeline_model.Rd ├── nlp_albert_embeddings_pretrained.Rd ├── nlp_albert_token_classification_pretrained.Rd ├── nlp_annotate.Rd ├── nlp_annotate_full.Rd ├── nlp_annotation.Rd ├── nlp_annotation_read_dataset.Rd ├── nlp_annotation_tool_json_reader.Rd ├── nlp_assertion_dl.Rd ├── nlp_assertion_dl_pretrained.Rd ├── nlp_assertion_filterer.Rd ├── nlp_assertion_logreg.Rd ├── nlp_assertion_logreg_pretrained.Rd ├── nlp_bert_embeddings_pretrained.Rd ├── nlp_bert_sentence_chunk_embeddings_pretrained.Rd ├── nlp_bert_sentence_embeddings_pretrained.Rd ├── nlp_bert_token_classification_pretrained.Rd ├── nlp_chunk2doc.Rd ├── nlp_chunk2token.Rd ├── nlp_chunk_embeddings.Rd ├── nlp_chunk_entity_resolver.Rd ├── nlp_chunk_entity_resolver_pretrained.Rd ├── nlp_chunk_filterer.Rd ├── nlp_chunker.Rd ├── nlp_classifier_dl.Rd ├── nlp_classifier_dl_pretrained.Rd ├── nlp_conll_read_dataset.Rd ├── nlp_conllu_read_dataset.Rd ├── nlp_context_spell_checker.Rd ├── nlp_context_spell_checker_pretrained.Rd ├── nlp_contextual_parser.Rd ├── nlp_date_matcher.Rd ├── nlp_date_normalizer.Rd ├── nlp_dependency_parser.Rd ├── nlp_dependency_parser_pretrained.Rd ├── nlp_distilbert_embeddings_pretrained.Rd ├── nlp_distilbert_token_classification_pretrained.Rd ├── nlp_doc2chunk.Rd ├── nlp_document_assembler.Rd ├── nlp_document_logreg_classifier.Rd ├── nlp_document_normalizer.Rd ├── nlp_drug_normalizer.Rd ├── nlp_elmo_embeddings_pretrained.Rd ├── nlp_embeddings_finisher.Rd ├── nlp_entity_ruler.Rd ├── nlp_finisher.Rd ├── nlp_generate_assertion_train_set.Rd ├── nlp_generate_colln.Rd ├── nlp_generate_plain_assertion_train_set.Rd ├── nlp_get_classes.Rd ├── nlp_graph_extraction.Rd ├── nlp_graph_finisher.Rd ├── nlp_language_detector_dl_pretrained.Rd ├── nlp_lemmatizer.Rd ├── nlp_lemmatizer_pretrained.Rd ├── nlp_light_pipeline.Rd ├── nlp_longformer_embeddings_pretrained.Rd ├── nlp_longformer_token_classification_pretrained.Rd ├── nlp_marian_transformer.Rd ├── nlp_marian_transformer_pretrained.Rd ├── nlp_medical_ner.Rd ├── nlp_medical_ner_pretrained.Rd ├── nlp_multi_classifier_dl.Rd ├── nlp_multi_classifier_dl_pretrained.Rd ├── nlp_multi_date_matcher.Rd ├── nlp_ner_chunker.Rd ├── nlp_ner_converter.Rd ├── nlp_ner_converter_internal.Rd ├── nlp_ner_crf.Rd ├── nlp_ner_crf_pretrained.Rd ├── nlp_ner_dl.Rd ├── nlp_ner_dl_pretrained.Rd ├── nlp_ngram_generator.Rd ├── nlp_normalizer.Rd ├── nlp_norvig_spell_checker.Rd ├── nlp_norvig_spell_checker_pretrained.Rd ├── nlp_perceptron.Rd ├── nlp_perceptron_pretrained.Rd ├── nlp_pos.Rd ├── nlp_pretrained_pipeline.Rd ├── nlp_pubtator_read_dataset.Rd ├── nlp_re_ner_chunks_filter.Rd ├── nlp_recursive_pipeline.Rd ├── nlp_recursive_tokenizer.Rd ├── nlp_regex_matcher.Rd ├── nlp_relation_extraction.Rd ├── nlp_relation_extraction_dl.Rd ├── nlp_relation_extraction_dl_pretrained.Rd ├── nlp_relation_extraction_pretrained.Rd ├── nlp_resource_downloader.Rd ├── nlp_roberta_embeddings_pretrained.Rd ├── nlp_roberta_sentence_embeddings_pretrained.Rd ├── nlp_roberta_token_classification_pretrained.Rd ├── nlp_sentence_detector.Rd ├── nlp_sentence_detector_dl.Rd ├── nlp_sentence_detector_dl_pretrained.Rd ├── nlp_sentence_embeddings.Rd ├── nlp_sentence_entity_resolver.Rd ├── nlp_sentence_entity_resolver_pretrained.Rd ├── nlp_sentiment_detector.Rd ├── nlp_sentiment_dl.Rd ├── nlp_sentiment_dl_pretrained.Rd ├── nlp_set_input_cols.Rd ├── nlp_set_output_col.Rd ├── nlp_set_param.Rd 
├── nlp_set_param_tuple2.Rd ├── nlp_spark_annotation.Rd ├── nlp_stemmer.Rd ├── nlp_stop_words_cleaner.Rd ├── nlp_symmetric_delete.Rd ├── nlp_symmetric_delete_pretrained.Rd ├── nlp_t5_transformer.Rd ├── nlp_t5_transformer_pretrained.Rd ├── nlp_text_matcher.Rd ├── nlp_token_assembler.Rd ├── nlp_tokenizer.Rd ├── nlp_typed_dependency_parser.Rd ├── nlp_typed_dependency_parser_pretrained.Rd ├── nlp_univ_sent_encoder.Rd ├── nlp_univ_sent_encoder_pretrained.Rd ├── nlp_version.Rd ├── nlp_vivekn_sentiment_detector.Rd ├── nlp_vivekn_sentiment_pretrained.Rd ├── nlp_word_embeddings.Rd ├── nlp_word_embeddings_model.Rd ├── nlp_word_embeddings_pretrained.Rd ├── nlp_xlm_roberta_embeddings_pretrained.Rd ├── nlp_xlm_roberta_sentence_embeddings_pretrained.Rd ├── nlp_xlm_roberta_token_classification_pretrained.Rd ├── nlp_xlnet_embeddings_pretrained.Rd ├── nlp_xlnet_token_classification_pretrained.Rd ├── nlp_yake_model.Rd └── set_nlp_version.Rd ├── sparknlp.Rproj └── tests ├── testthat.R └── testthat ├── .gitignore ├── data ├── .gitignore ├── AskAPatient.fold-0.test.txt ├── corpus_pubtator_sample.txt ├── crf-eng.train.small ├── dependency_treebank │ └── wsj_0001.dp ├── e2e.csv ├── en.test.conllu ├── eng.testa.conll ├── entities.txt ├── entity_ruler │ └── patterns.csv ├── gender.csv ├── gender.json ├── i2b2_assertion_sample.csv ├── lemmas_small.txt ├── pos_corpus.txt ├── random_embeddings_dim4.txt ├── re_train.parquet ├── regex_match.txt ├── result.json ├── sentiment.csv ├── sentiment.parquet │ ├── ._SUCCESS.crc │ ├── .part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet.crc │ ├── _SUCCESS │ └── part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet ├── sentiment_dictionary.txt ├── train.conll2009.txt └── words.txt ├── helper-initialize.R ├── testthat-albert-embeddings.R ├── testthat-albert-for-token-classification.R ├── testthat-annotate.R ├── testthat-annotation_tool_json_reader.R ├── testthat-assertion_dl.R ├── testthat-assertion_filterer.R ├── testthat-assertion_logreg.R ├── testthat-bert-embeddings.R ├── testthat-bert-for-token-classification.R ├── testthat-bert_sentence_chunk_embeddings.R ├── testthat-bert_sentence_embeddings.R ├── testthat-chunk-embeddings.R ├── testthat-chunk2doc.R ├── testthat-chunk2token.R ├── testthat-chunk_entity_resolver.R ├── testthat-chunk_filterer.R ├── testthat-chunker.R ├── testthat-classifier_dl.R ├── testthat-context-spell-checker.R ├── testthat-contextual_parser.R ├── testthat-date-matcher.R ├── testthat-date_normalizer.R ├── testthat-dependency-parser.R ├── testthat-distilbert-embeddings.R ├── testthat-distilbert-for-token-classification.R ├── testthat-doc2chunk.R ├── testthat-document-assembler.R ├── testthat-document_logreg_classifier.R ├── testthat-document_normalizer.R ├── testthat-drug_normalizer.R ├── testthat-elmo-embeddings.R ├── testthat-embeddings_finisher.R ├── testthat-entity_ruler.R ├── testthat-finisher.R ├── testthat-graph-extraction.R ├── testthat-graph-finisher.R ├── testthat-language-detector-dl.R ├── testthat-lemmatizer.R ├── testthat-light-pipeline.R ├── testthat-longformer-embeddings.R ├── testthat-longformer-for-token-classification.R ├── testthat-marian_transformer.R ├── testthat-medical-ner.R ├── testthat-multi-date-matcher.R ├── testthat-multi_classifier_dl.R ├── testthat-ner-converter.R ├── testthat-ner-converter_internal.R ├── testthat-ner-crf.R ├── testthat-ner-dl.R ├── testthat-ner_chunker.R ├── testthat-ngram-generator.R ├── testthat-normalizer.R ├── testthat-norvig-spell-checker.R ├── testthat-perceptron.R ├── 
testthat-pretrained-pipeline.R ├── testthat-pubtator.R ├── testthat-re_ner_chunks_filter.R ├── testthat-recursive-pipeline.R ├── testthat-recursive-tokenizer.R ├── testthat-regex-matcher.R ├── testthat-relation_extraction.R ├── testthat-relation_extraction_dl.R ├── testthat-roberta-embeddings.R ├── testthat-roberta-for-token-classification.R ├── testthat-roberta_sentence_embeddings.R ├── testthat-sentence-detector.R ├── testthat-sentence-embeddings.R ├── testthat-sentence_detector_dl.R ├── testthat-sentence_entity_resolver.R ├── testthat-sentiment-detector.R ├── testthat-sentiment-dl.R ├── testthat-stemmer.R ├── testthat-stop_words_cleaner.R ├── testthat-symmetric-delete.R ├── testthat-t5_transformer.R ├── testthat-text-matcher.R ├── testthat-token-assembler.R ├── testthat-tokenizer.R ├── testthat-typed-dependency-parser.R ├── testthat-univ_sent_encoder.R ├── testthat-utils.R ├── testthat-vivekn-sentiment-detector.R ├── testthat-word-embeddings.R ├── testthat-xlm-roberta-embeddings.R ├── testthat-xlm_roberta-for-token-classification.R ├── testthat-xlm_roberta_sentence_embeddings.R ├── testthat-xlnet-embeddings.R ├── testthat-xlnet-for-token-classification.R ├── testthat-yake_model.R └── tf_graphs ├── RE_in1200D_out20.pb ├── blstm_34_32_30_200_6.pb └── blstm_5_200_128_67.pb /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^internal$ 4 | ^spark-warehouse$ 5 | ^logs$ 6 | ^LICENSE\.md$ 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | internal 6 | logs 7 | spark-warehouse 8 | derby.log 9 | scripts 10 | examples/*.html 11 | *.nb.html 12 | tests/testthat/training_logs/ 13 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sparknlp 2 | Type: Package 3 | Title: R Interface to John Snow Labs Spark NLP 4 | Version: 0.16.0 5 | Authors@R: c( 6 | person("Dave", "Kincaid", email = "dave@kincaidlabs.ai", role = c("aut")), 7 | person("Kevin", "Kuo", email = "kevin.kuo@rstudio.com", role = c("aut", "cre"), 8 | comment = c(ORCID = "0000-0001-7803-7901")), 9 | person(family = "RStudio", role = c("cph")) 10 | ) 11 | Maintainer: Dave Kincaid 12 | Description: This package implements functions to use John Snow Labs Spark NLP with sparklyr. 13 | License: Apache License (>= 2.0) 14 | BugReports: https://github.com/rstudio/sparknlp/issues 15 | Encoding: UTF-8 16 | LazyData: true 17 | Depends: 18 | R (>= 3.1.2), 19 | sparklyr (>= 1.2.0.9000) 20 | Imports: 21 | forge 22 | Roxygen: list(markdown = TRUE) 23 | RoxygenNote: 7.1.2 24 | Suggests: 25 | testthat (>= 3.0.0) 26 | Config/testthat/edition: 3 27 | -------------------------------------------------------------------------------- /R/java.R: -------------------------------------------------------------------------------- 1 | #' @import forge 2 | read_as <- function(sc, value) { 3 | value <- forge::cast_choice(value, c("TEXT", "BINARY", "SPARK")) 4 | invoke_static(sc, "com.johnsnowlabs.nlp.util.io.ReadAs", value) 5 | } 6 | 7 | # As of Spark NLP 2.3.0 these functions are no longer necessary 8 | # # Function to return default argument values for Scala constructors and static methods. 
Use "constructor" for the 9 | # # method name if you want default constructor argument values 10 | # default_argument_static <- function(sc, class_name, method_name, arg_num) { 11 | # module <- invoke_static(sc, paste0(class_name, "$"), "MODULE$") 12 | # 13 | # if (method_name == "constructor") { 14 | # method_name = "apply" 15 | # } 16 | # 17 | # default_name <- paste0(method_name, "$default$", arg_num) 18 | # invoke(module, default_name) 19 | # } 20 | # 21 | # # Function to return default argument values for Scala instance methods 22 | # default_argument <- function(x, method_name, arg_num) { 23 | # default_name <- paste0(method_name, "$default$", arg_num) 24 | # invoke(x, default_name) 25 | # } 26 | -------------------------------------------------------------------------------- /R/light-pipeline.R: -------------------------------------------------------------------------------- 1 | #' Spark NLP Light pipeline 2 | #' 3 | #' LightPipelines are Spark ML pipelines converted into a single machine but multithreaded task, becoming more than 4 | #' 10x times faster for smaller amounts of data (small is relative, but 50k sentences is roughly a good maximum). 5 | #' To use them, simply plug in a trained (fitted) pipeline. 6 | #' 7 | #' @param x a trained (fitted) pipeline 8 | #' @param parse_embeddings whether to parse the embeddings 9 | #' 10 | #' @return a LightPipeline object 11 | #' 12 | #' @export 13 | #' 14 | nlp_light_pipeline <- function(x, parse_embeddings = FALSE) { 15 | UseMethod("nlp_light_pipeline", x) 16 | } 17 | 18 | #' @export 19 | nlp_light_pipeline.nlp_pretrained_pipeline <- function(x, parse_embeddings = FALSE) { 20 | new_nlp_light_pipeline(invoke(spark_jobj(x), "lightModel")) 21 | } 22 | 23 | #' @export 24 | nlp_light_pipeline.ml_pipeline_model <- function(x, parse_embeddings = FALSE) { 25 | sc <- spark_connection(x) 26 | jobj <- invoke_new(sc, "com.johnsnowlabs.nlp.LightPipeline", spark_jobj(x), parse_embeddings) 27 | new_nlp_light_pipeline(jobj) 28 | } 29 | 30 | new_nlp_light_pipeline <- function(jobj) { 31 | structure(list(.jobj = jobj), class = c("nlp_light_pipeline", "ml_pipeline_model", "ml_transformer")) 32 | } 33 | 34 | #' @export 35 | spark_jobj.nlp_light_pipeline <- function(x, ...) { 36 | x$.jobj 37 | } -------------------------------------------------------------------------------- /R/resource_downloader.R: -------------------------------------------------------------------------------- 1 | #' SparkNLP ResourceDownloader functions 2 | #' 3 | #' ResourceDownloader provides functions to easily look for pretrained models & pipelines 4 | #' inside Spark NLP. 
You can filter models or pipelines via language, version, 5 | #' or the name of the annotator 6 | #' 7 | #' @param sc a spark_connect object 8 | #' @param lang language to restrict the results to 9 | #' @param version Spark NLP version to restrict results to 10 | #' 11 | #' @return a markdown table containing the models or pipelines filtered by the provided arguments 12 | #' 13 | #' @name nlp_resource_downloader 14 | #' @aliases ResourceDownloader 15 | NULL 16 | 17 | #' @rdname nlp_resource_downloader 18 | #' @export 19 | nlp_show_public_pipelines <- function(sc, lang = NULL, version = NULL) { 20 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "showPublicPipelines", lang, version) 21 | return(result) 22 | } 23 | 24 | #' @param annotator name of annotator to restrict results 25 | #' @rdname nlp_resource_downloader 26 | #' @export 27 | nlp_show_public_models <- function(sc, annotator = NULL, lang = NULL, version = NULL) { 28 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "showPublicModels", annotator, lang, version) 29 | return(result) 30 | } 31 | 32 | #' @param name name of object to clear 33 | #' @param language language to clear 34 | #' @param remote_loc remote_loc of models to clear 35 | #' @rdname nlp_resource_downloader 36 | #' @export 37 | nlp_clear_cache <- function(sc, name = NULL, language = NULL, remote_loc = NULL) { 38 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "clearCache", name, language, remote_loc) 39 | return(result) 40 | } 41 | 42 | #' @rdname nlp_resource_downloader 43 | #' @export 44 | nlp_show_available_annotators <- function(sc) { 45 | result <- sparklyr::invoke_static(sc, "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader", "showAvailableAnnotators") 46 | return(result) 47 | } 48 | -------------------------------------------------------------------------------- /examples/annotation/Pretrained-MatchPattern-Pipeline.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Use pretrained match_pattern Pipeline" 3 | output: html_notebook 4 | --- 5 | 6 | This notebook is adapted from John Snow Labs Jupyter/Python getting started notebook. See 7 | https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/match-pattern-pipeline/Pretrained-MatchPattern-Pipeline.ipynb 8 | for that version. 9 | 10 | * DocumentAssembler 11 | * SentenceDetector 12 | * Tokenizer 13 | * RegexMatcher (match phone numbers) 14 | 15 | ```{r} 16 | library(sparklyr) 17 | library(sparknlp) 18 | library(dplyr) 19 | ``` 20 | 21 | # Let's create a Spark connection for our app 22 | ```{r} 23 | version <- Sys.getenv("SPARK_VERSION", unset = "2.4.0") 24 | 25 | config <- sparklyr::spark_config() 26 | 27 | options(sparklyr.sanitize.column.names.verbose = TRUE) 28 | options(sparklyr.verbose = TRUE) 29 | options(sparklyr.na.omit.verbose = TRUE) 30 | options(sparklyr.na.action.verbose = TRUE) 31 | sc <- sparklyr::spark_connect(master = "local", version = version, config = config) 32 | ``` 33 | 34 | This Pipeline can extract phone numbers in these formats: 35 | 36 | 0689912549
37 | +33698912549
38 | +33 6 79 91 25 49
39 | +33-6-79-91-25-49
40 | (555)-555-5555
41 | 555-555-5555
42 | +1-238 6 79 91 25 49
43 | +1-555-532-3455
44 | +15555323455
45 | +7 06 79 91 25 49 46 | 47 | ```{r} 48 | pipeline <- nlp_pretrained_pipeline(sc, "match_pattern", lang = "en") 49 | ``` 50 | 51 | ```{r} 52 | result <- nlp_annotate(pipeline, "You should call Mr. Jon Doe at +33 1 79 01 22 89") 53 | ``` 54 | 55 | ```{r} 56 | pull(result, regex)[[1]][[1]][[4]] 57 | ``` 58 | ```{r} 59 | result <- nlp_annotate(pipeline, "Ring me up dude! +1-334-179-1466") 60 | ``` 61 | 62 | ```{r} 63 | pull(result, regex)[[1]][[1]][[4]] 64 | ``` 65 | 66 | 67 | -------------------------------------------------------------------------------- /examples/annotation/entities.txt: -------------------------------------------------------------------------------- 1 | i think 2 | Feeling strangely 3 | guitar lessons -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/english_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/english_models.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/nerdl-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/nerdl-model.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/pipeline.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/pre-trained-pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/pre-trained-pipelines.png -------------------------------------------------------------------------------- /examples/tutorials/certification_trainings/stacked_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/examples/tutorials/certification_trainings/stacked_pipeline.png -------------------------------------------------------------------------------- /inst/java/sparknlp-2.4-2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/inst/java/sparknlp-2.4-2.11.jar -------------------------------------------------------------------------------- /inst/java/sparknlp-3.0-2.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/inst/java/sparknlp-3.0-2.12.jar -------------------------------------------------------------------------------- /inst/java/sparknlp-3.1-2.12.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/inst/java/sparknlp-3.1-2.12.jar -------------------------------------------------------------------------------- /man-roxygen/roxlate-input-output-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_col Input column. String. 2 | #' @param output_col Output column. String. 3 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-input-outputs-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_col Input column. String. 2 | #' @param output_cols Output columns. String array. -------------------------------------------------------------------------------- /man-roxygen/roxlate-inputs-output-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_cols Input columns. String array. 2 | #' @param output_col Output column. String. 3 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-inputs-outputs-params.R: -------------------------------------------------------------------------------- 1 | #' @param input_cols Input columns. String array. 2 | #' @param output_cols Output columns. String array. 3 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-nlp-algo.R: -------------------------------------------------------------------------------- 1 | #' @param x A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}. 2 | #' @param uid A character string used to uniquely identify the ML estimator. 3 | #' 4 | #' @return The object returned depends on the class of \code{x}. 5 | #' 6 | #' \itemize{ 7 | #' \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 8 | #' a Spark \code{Estimator} object and can be used to compose 9 | #' \code{Pipeline} objects. 10 | #' 11 | #' \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 12 | #' the NLP estimator appended to the pipeline. 13 | #' 14 | #' \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 15 | #' immediately fit with the input \code{tbl_spark}, returning an NLP model. 16 | #' } 17 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-nlp-ml-algo.R: -------------------------------------------------------------------------------- 1 | #' @param x A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}. 2 | #' @param uid A character string used to uniquely identify the ML estimator. 3 | #' 4 | #' @return The object returned depends on the class of \code{x}. 5 | #' 6 | #' \itemize{ 7 | #' \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 8 | #' a Spark \code{Estimator} object and can be used to compose 9 | #' \code{Pipeline} objects. 10 | #' 11 | #' \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 12 | #' a default pretrained NLP model appended to the pipeline. 
13 | #' 14 | #' \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 15 | #' immediately fit with the input \code{tbl_spark}, returning an NLP model. 16 | #' } 17 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-nlp-transformer.R: -------------------------------------------------------------------------------- 1 | #' @param x A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}. 2 | #' @param uid A character string used to uniquely identify the ML transformer. 3 | #' 4 | #' @return The object returned depends on the class of \code{x}. 5 | #' 6 | #' \itemize{ 7 | #' \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_transformer} object. The object contains a pointer to 8 | #' a Spark \code{Transformer} object and can be used to compose 9 | #' \code{Pipeline} objects. 10 | #' 11 | #' \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 12 | #' the NLP transformer/annotator appended to the pipeline. 13 | #' 14 | #' \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, a transformer is constructed then 15 | #' immediately fit with the input \code{tbl_spark}, returning the transformed data frame. 16 | #' } 17 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-pretrained-params.R: -------------------------------------------------------------------------------- 1 | #' In most cases you can just leave the parameters NULL (except for the Spark connection) and the Spark NLP defaults 2 | #' will be used. 3 | #' 4 | #' @param sc A Spark connection 5 | #' @param name the name of the model to load. If NULL will use the default value 6 | #' @param lang the language of the model to be loaded. If NULL will use the default value 7 | #' @param remote_loc the remote location of the model. If NULL will use the default value 8 | #' 9 | #' @return The Spark NLP model with the pretrained model loaded 10 | -------------------------------------------------------------------------------- /man/as_pipeline_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pretrained-pipeline.R 3 | \name{as_pipeline_model} 4 | \alias{as_pipeline_model} 5 | \title{Get the PipelineModel from a Spark NLP pretrained pipeline} 6 | \usage{ 7 | as_pipeline_model(pipeline) 8 | } 9 | \arguments{ 10 | \item{pretrained_pipeline}{the Spark NLP PretrainedPipeline object} 11 | } 12 | \value{ 13 | the Spark ML pipeline model from the input 14 | } 15 | \description{ 16 | Spark NLP pretrained pipelines are not Spark ML pipeline models. This function 17 | will retrieve the ML pipeline model from the pretrained pipeline object. 
18 | } 19 | -------------------------------------------------------------------------------- /man/nlp_albert_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/albert-embeddings.R 3 | \name{nlp_albert_embeddings_pretrained} 4 | \alias{nlp_albert_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP AlbertEmbeddings model} 6 | \usage{ 7 | nlp_albert_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | lazy_annotator = NULL, 15 | max_sentence_length = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. String.} 28 | 29 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 30 | 31 | \item{batch_size}{batch size} 32 | 33 | \item{dimension}{the embedding dimension} 34 | 35 | \item{lazy_annotator}{use as a lazy annotator or not} 36 | 37 | \item{max_sentence_length}{set the maximum sentence length} 38 | 39 | \item{storage_ref}{storage reference name} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{AlbertEmbeddings} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_albert_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/albert-for-token-classification.R 3 | \name{nlp_albert_token_classification_pretrained} 4 | \alias{nlp_albert_token_classification_pretrained} 5 | \title{Spark NLP AlbertForTokenClassification} 6 | \usage{ 7 | nlp_albert_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 
41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | AlbertForTokenClassification can load ALBERT Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#albertfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_annotate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotate.R 3 | \name{nlp_annotate} 4 | \alias{nlp_annotate} 5 | \title{Annotate some text} 6 | \usage{ 7 | nlp_annotate(x, target, column = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{some SparkNLP object that has an annotate method that takes a Spark data frame as argument} 11 | 12 | \item{target}{the text to annotate. This can be a character string, a character vector or a data frame (with the text 13 | in a field named "text")} 14 | 15 | \item{column}{the column name containing text if a Spark DataFrame is passed in.} 16 | } 17 | \value{ 18 | If given a character vector the return value is a list of lists containing the annotations. 19 | 20 | If given a Spark DataFrame the return value is a Spark data frame containing the annotations 21 | } 22 | \description{ 23 | Use SparkNLP to annotate some text. 24 | } 25 | -------------------------------------------------------------------------------- /man/nlp_annotate_full.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotate.R 3 | \name{nlp_annotate_full} 4 | \alias{nlp_annotate_full} 5 | \title{Fully annotate some text} 6 | \usage{ 7 | nlp_annotate_full(x, target, column = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{some SparkNLP object that has an annotate method that takes a Spark data frame as argument} 11 | 12 | \item{target}{the text to annotate. This can be a character string, a character vector or a data frame (with the text 13 | in a field named "text")} 14 | 15 | \item{column}{the column name containing text if a Spark DataFrame is passed in.} 16 | } 17 | \value{ 18 | If given a character vector the return value is a list of lists containing the annotations. 19 | 20 | If given a Spark DataFrame the return value is a Spark data frame containing the annotations 21 | } 22 | \description{ 23 | Use Spark NLP to fully annotate some text. 
24 | } 25 | -------------------------------------------------------------------------------- /man/nlp_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation.R 3 | \name{nlp_annotation} 4 | \alias{nlp_annotation} 5 | \title{Spark NLP S3 Annotation object} 6 | \usage{ 7 | nlp_annotation(x) 8 | } 9 | \arguments{ 10 | \item{x}{a spark_jobj that is an Annotation object or a named list} 11 | } 12 | \value{ 13 | a local nlp_annotation object 14 | } 15 | \description{ 16 | A Spark NLP annotation S3 object has the following fields: 17 | \itemize{ 18 | \item annotatorType: the type of annotation (String) 19 | \item begin: the index of the first character under this annotation (integer) 20 | \item end: the index after the last character under this annotation (integer) 21 | \item metadata: associated metadata for this annotation (Map(String, String)) 22 | \item result: the main output of the annotation (String) 23 | \item embeddings: vector of embeddings (Array(Float)) 24 | } 25 | } 26 | \details{ 27 | See \url{https://nlp.johnsnowlabs.com/docs/en/concepts#annotation} 28 | } 29 | -------------------------------------------------------------------------------- /man/nlp_annotation_read_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_annotation_read_dataset} 4 | \alias{nlp_annotation_read_dataset} 5 | \title{Create a data frame from an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_annotation_read_dataset(reader, json_path) 8 | } 9 | \arguments{ 10 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 11 | 12 | \item{json_path}{path to the json from annotation lab export} 13 | } 14 | \value{ 15 | assertion train set data frame 16 | } 17 | \description{ 18 | Create a data frame from an AnnotationToolJsonReader 19 | } 20 | -------------------------------------------------------------------------------- /man/nlp_annotation_tool_json_reader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_annotation_tool_json_reader} 4 | \alias{nlp_annotation_tool_json_reader} 5 | \title{Spark NLP AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_annotation_tool_json_reader( 8 | sc, 9 | assertion_labels = list(), 10 | excluded_labels = list(), 11 | cleanup_mode = "disabled", 12 | split_chars = list(), 13 | context_chars = list(), 14 | scheme = "IOB", 15 | min_chars_tol = 2L, 16 | align_chars_tol = 1L, 17 | merge_overlapping = TRUE, 18 | sddl_path = "" 19 | ) 20 | } 21 | \arguments{ 22 | \item{assertion_labels}{list of strings} 23 | 24 | \item{excluded_labels}{list of strings} 25 | 26 | \item{cleanup_mode}{string (Default: disabled)} 27 | 28 | \item{split_chars}{list of strings} 29 | 30 | \item{context_chars}{list of strings} 31 | 32 | \item{scheme}{string (Default: "IOB")} 33 | 34 | \item{min_chars_tol}{integer (Default: 2)} 35 | 36 | \item{align_chars_tol}{integer (Default: 1)} 37 | 38 | \item{merge_overlapping}{boolean (Default: true)} 39 | 40 | \item{sddl_path}{string (Default: "")} 41 | } 42 | \value{ 43 | assertion train set 44 | } 45 | \description{ 46 | The annotation tool json reader is a reader that 
generate a assertion train set from the json from annotations labs exports. 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_assertion_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assertion_dl.R 3 | \name{nlp_assertion_dl_pretrained} 4 | \alias{nlp_assertion_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Assertion DL model} 6 | \usage{ 7 | nlp_assertion_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | scope_window = NULL, 13 | max_sent_len = NULL, 14 | storage_ref = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{batch_size}{Parameter, which regulates the size of the batch} 28 | 29 | \item{scope_window}{The scope window of the assertion (whole sentence by default)} 30 | 31 | \item{max_sent_len}{Parameter, which regulates the length of the longest sentence} 32 | 33 | \item{storage_ref}{storage reference for embeddings} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{AssertionDLModel} model 46 | } 47 | -------------------------------------------------------------------------------- /man/nlp_assertion_filterer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assertion_filterer.R 3 | \name{nlp_assertion_filterer} 4 | \alias{nlp_assertion_filterer} 5 | \title{Spark NLP AssertionFilterer} 6 | \usage{ 7 | nlp_assertion_filterer( 8 | x, 9 | input_cols, 10 | output_col, 11 | criteria = NULL, 12 | whitelist = NULL, 13 | regex = NULL, 14 | uid = random_string("assertion_filterer_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{criteria}{isin or regex} 25 | 26 | \item{whitelist}{If defined, list of entities to process.} 27 | 28 | \item{regex}{If defined, list of entities to process.} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that will allow you to filter out the named entities by 49 | the list of acceptable assertion statuses. This annotator would be quite handy 50 | if you want to set a white list for the acceptable assertion statuses like 51 | present or conditional; and do not want absent conditions get out of your pipeline. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_release_notes#3-assertionfilterer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_assertion_logreg_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assertion_logreg.R 3 | \name{nlp_assertion_logreg_pretrained} 4 | \alias{nlp_assertion_logreg_pretrained} 5 | \title{Load a pretrained Spark NLP Assertion LogReg model} 6 | \usage{ 7 | nlp_assertion_logreg_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | before = NULL, 12 | after = NULL, 13 | start_col = NULL, 14 | end_col = NULL, 15 | lazy_annotator = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. String.} 28 | 29 | \item{before}{Amount of tokens from the context before the target} 30 | 31 | \item{after}{Amount of tokens from the context after the target} 32 | 33 | \item{start_col}{Column that contains the token number for the start of the target} 34 | 35 | \item{end_col}{Column that contains the token number for the end of the target} 36 | 37 | \item{lazy_annotator}{a Param in Annotators that allows them to stand idle in the Pipeline and do nothing. Can be called by other Annotators in a RecursivePipeline} 38 | 39 | \item{storage_ref}{storage reference for embeddings} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{AssertionLogRegModel} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_bert_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bert-embeddings.R 3 | \name{nlp_bert_embeddings_pretrained} 4 | \alias{nlp_bert_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP BertEmbeddings model} 6 | \usage{ 7 | nlp_bert_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | lazy_annotator = NULL, 15 | max_sentence_length = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. 
String.} 28 | 29 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 30 | 31 | \item{batch_size}{batch size} 32 | 33 | \item{dimension}{the embedding dimension} 34 | 35 | \item{lazy_annotator}{use as a lazy annotator or not} 36 | 37 | \item{max_sentence_length}{set the maximum sentence length} 38 | 39 | \item{storage_ref}{storage reference name} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{BertEmbeddings} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_bert_sentence_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bert_sentence_embeddings.R 3 | \name{nlp_bert_sentence_embeddings_pretrained} 4 | \alias{nlp_bert_sentence_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP BertSentenceEmbeddings model} 6 | \usage{ 7 | nlp_bert_sentence_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to lowercase tokens or not} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{defines the output layer of BERT when calculating embeddings} 32 | 33 | \item{max_sentence_length}{max sentence length to process} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{BertSentenceEmbeddings} model. 46 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#bertsentenceembeddings} 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_bert_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bert-for-token-classification.R 3 | \name{nlp_bert_token_classification_pretrained} 4 | \alias{nlp_bert_token_classification_pretrained} 5 | \title{Spark NLP BertForTokenClassification} 6 | \usage{ 7 | nlp_bert_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. 
String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | BertForTokenClassification can load Bert Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#bertfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_chunk2doc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk2doc.R 3 | \name{nlp_chunk2doc} 4 | \alias{nlp_chunk2doc} 5 | \title{Spark NLP Chunk2Doc} 6 | \usage{ 7 | nlp_chunk2doc(x, input_cols, output_col, uid = random_string("chunk2doc_")) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 11 | 12 | \item{input_cols}{Input columns. String array.} 13 | 14 | \item{output_col}{Output column. String.} 15 | 16 | \item{uid}{A character string used to uniquely identify the ML estimator.} 17 | } 18 | \value{ 19 | The object returned depends on the class of \code{x}. 20 | 21 | \itemize{ 22 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 23 | a Spark \code{Estimator} object and can be used to compose 24 | \code{Pipeline} objects. 25 | 26 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 27 | the NLP estimator appended to the pipeline. 28 | 29 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 30 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 31 | } 32 | } 33 | \description{ 34 | Spark ML transformer that Converts a CHUNK type column back into DOCUMENT. Useful when trying to re-tokenize or do 35 | further analysis on a CHUNK result. 
36 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#chunk2doc} 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_chunk2token.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk2token.R 3 | \name{nlp_chunk2token} 4 | \alias{nlp_chunk2token} 5 | \title{Spark NLP Chunk2Token} 6 | \usage{ 7 | nlp_chunk2token(x, input_cols, output_col, uid = random_string("chunk2token_")) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 11 | 12 | \item{input_cols}{Input columns. String array.} 13 | 14 | \item{output_col}{Output column. String.} 15 | 16 | \item{uid}{A character string used to uniquely identify the ML estimator.} 17 | } 18 | \value{ 19 | The object returned depends on the class of \code{x}. 20 | 21 | \itemize{ 22 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 23 | a Spark \code{Estimator} object and can be used to compose 24 | \code{Pipeline} objects. 25 | 26 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 27 | the NLP estimator appended to the pipeline. 28 | 29 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 30 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 31 | } 32 | } 33 | \description{ 34 | Spark ML transformer that 35 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_annotators#chunk2token} 36 | } 37 | -------------------------------------------------------------------------------- /man/nlp_chunk_embeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk-embeddings.R 3 | \name{nlp_chunk_embeddings} 4 | \alias{nlp_chunk_embeddings} 5 | \title{Spark NLP ChunkEmbeddings} 6 | \usage{ 7 | nlp_chunk_embeddings( 8 | x, 9 | input_cols, 10 | output_col, 11 | pooling_strategy = NULL, 12 | uid = random_string("chunk_embeddings_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{pooling_strategy}{Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that utilizes WordEmbeddings or BertEmbeddings to generate chunk embeddings from either Chunker, 43 | NGramGenerator, or NerConverter outputs. 44 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#chunkembeddings} 45 | } 46 | -------------------------------------------------------------------------------- /man/nlp_chunk_filterer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk_filterer.R 3 | \name{nlp_chunk_filterer} 4 | \alias{nlp_chunk_filterer} 5 | \title{Spark NLP ChunkFilterer} 6 | \usage{ 7 | nlp_chunk_filterer( 8 | x, 9 | input_cols, 10 | output_col, 11 | criteria = NULL, 12 | whitelist = NULL, 13 | regex = NULL, 14 | uid = random_string("chunk_filterer_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{criteria}{isin or regex} 25 | 26 | \item{whitelist}{If defined, list of entities to process.} 27 | 28 | \item{regex}{If defined, list of entities to process.} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that will filter out named entities by some conditions 49 | or predefined look-up lists, so that you can feed these entities to other 50 | annotators like Assertion Status or Entity Resolvers. It can be used with 51 | two criteria: isin and regex. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_release_notes#2-chunkfilterer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_chunker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunker.R 3 | \name{nlp_chunker} 4 | \alias{nlp_chunker} 5 | \title{Spark NLP Chunker - Meaningful phrase matching} 6 | \usage{ 7 | nlp_chunker( 8 | x, 9 | input_cols, 10 | output_col, 11 | regex_parsers = NULL, 12 | uid = random_string("chunker_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{regex_parsers}{the regular expression parsers to use for the chunking} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 
28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that matches a pattern of part-of-speech tags in order to return meaningful phrases from document 43 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#chunker} 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_classifier_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/classifier_dl.R 3 | \name{nlp_classifier_dl_pretrained} 4 | \alias{nlp_classifier_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Classifier DL model} 6 | \usage{ 7 | nlp_classifier_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | include_confidence = NULL, 12 | name = NULL, 13 | lang = NULL, 14 | remote_loc = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{include_confidence}{whether to include the confidence scores in the predictions} 25 | 26 | \item{name}{the name of the model to load. If NULL will use the default value} 27 | 28 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 29 | 30 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Create a pretrained Spark NLP \code{ClassifierDLModel} model 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_conllu_read_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_conllu_read_dataset} 4 | \alias{nlp_conllu_read_dataset} 5 | \title{Transform CoNLLU format text file to Spark dataframe} 6 | \usage{ 7 | nlp_conllu_read_dataset(sc, path, read_as = NULL, explode_sentences = NULL) 8 | } 9 | \arguments{ 10 | \item{sc}{a Spark connection} 11 | 12 | \item{path}{path to the file to read} 13 | 14 | \item{read_as}{Can be LINE_BY_LINE or SPARK_DATASET, with options if latter is used (default LINE_BY_LINE)} 15 | } 16 | \description{ 17 | In order to train a Lemmatizer annotator, we need to get CoNLLU format data as a spark dataframe. 18 | There is a component that does this for us: it reads a plain text file and transforms it to a spark dataset. 19 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#conllu-dataset}. All the function arguments have defaults. 20 | See \url{https://nlp.johnsnowlabs.com/api/index.html#com.johnsnowlabs.nlp.training.CoNLLU} for the defaults. 
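For example, a minimal usage sketch (the file path and the sparklyr connection \code{sc} are illustrative): \preformatted{# read a CoNLL-U file into a Spark dataframe suitable for training a Lemmatizer
conllu_df <- nlp_conllu_read_dataset(sc, "path/to/train.conllu")
}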
21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_context_spell_checker_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/context-spell-checker.R 3 | \name{nlp_context_spell_checker_pretrained} 4 | \alias{nlp_context_spell_checker_pretrained} 5 | \title{Load a pretrained Spark NLP ContextSpellChecker model} 6 | \usage{ 7 | nlp_context_spell_checker_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{ContextSpellChecker} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_dependency_parser_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dependency-parser.R 3 | \name{nlp_dependency_parser_pretrained} 4 | \alias{nlp_dependency_parser_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_dependency_parser_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{DependencyParserModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_distilbert_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/distilbert-for-token-classification.R 3 | \name{nlp_distilbert_token_classification_pretrained} 4 | \alias{nlp_distilbert_token_classification_pretrained} 5 | \title{Spark NLP DistilBertForTokenClassification} 6 | \usage{ 7 | nlp_distilbert_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. 
String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{name}{the name of the model to load. If NULL will use the default value} 31 | 32 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 33 | 34 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 35 | } 36 | \value{ 37 | The Spark NLP model with the pretrained model loaded 38 | } 39 | \description{ 40 | DistilBertForTokenClassification can load DistilBERT Models with a token classification head on top 41 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 42 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#distilbertfortokenclassification} 43 | } 44 | -------------------------------------------------------------------------------- /man/nlp_drug_normalizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/drug_normalizer.R 3 | \name{nlp_drug_normalizer} 4 | \alias{nlp_drug_normalizer} 5 | \title{Spark NLP DrugNormalizer} 6 | \usage{ 7 | nlp_drug_normalizer( 8 | x, 9 | input_cols, 10 | output_col, 11 | lower_case = NULL, 12 | policy = NULL, 13 | uid = random_string("drug_normalizer_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{lower_case}{whether to convert strings to lowercase} 24 | 25 | \item{policy}{removalPolicy to remove patterns from text with a given policy} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that normalizes raw text from clinical documents, e.g. 46 | scraped web pages or xml documents, from document type columns into Sentence.
47 | Removes all dirty characters from text following one or more input regex 48 | patterns. Can apply unwanted character removal with a specific policy. 49 | Can apply lower case normalization. 50 | See \url{https://nlp.johnsnowlabs.com/licensed/api/index.html#com.johnsnowlabs.nlp.annotators.DrugNormalizer} 51 | } 52 | -------------------------------------------------------------------------------- /man/nlp_elmo_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/elmo-embeddings.R 3 | \name{nlp_elmo_embeddings_pretrained} 4 | \alias{nlp_elmo_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP ElmoEmbeddings model} 6 | \usage{ 7 | nlp_elmo_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | pooling_layer = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{the embedding dimension} 32 | 33 | \item{pooling_layer}{word_emb, lstm_outputs1, lstm_outputs2 or elmo} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{ElmoEmbeddings} model 46 | } 47 | -------------------------------------------------------------------------------- /man/nlp_embeddings_finisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/embeddings_finisher.R 3 | \name{nlp_embeddings_finisher} 4 | \alias{nlp_embeddings_finisher} 5 | \title{Spark NLP EmbeddingsFinisher} 6 | \usage{ 7 | nlp_embeddings_finisher( 8 | x, 9 | input_cols, 10 | output_cols, 11 | clean_annotations = NULL, 12 | output_as_vector = NULL, 13 | uid = random_string("embeddings_finisher_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_cols}{Output columns. String array.} 22 | 23 | \item{clean_annotations}{Whether to remove and cleanup the rest of the annotators (columns)} 24 | 25 | \item{output_as_vector}{if enabled, it will output the embeddings as Vectors instead of arrays} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects.
36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that is designed to deal with embedding annotators: WordEmbeddings, BertEmbeddings, 46 | SentenceEmbeddings, and ChunkEmbeddings. By using EmbeddingsFinisher you can easily transform your embeddings 47 | into arrays of floats or Vectors which are compatible with Spark ML functions such as LDA, K-means, Random Forest 48 | classifier or any other functions that require a featureCol. 49 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#embeddingsfinisher} 50 | } 51 | -------------------------------------------------------------------------------- /man/nlp_finisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/finisher.R 3 | \name{nlp_finisher} 4 | \alias{nlp_finisher} 5 | \title{Spark NLP Finisher} 6 | \usage{ 7 | nlp_finisher( 8 | x, 9 | input_cols, 10 | output_cols = NULL, 11 | clean_annotations = NULL, 12 | value_split_symbol = NULL, 13 | annotation_split_symbol = NULL, 14 | include_metadata = NULL, 15 | output_as_array = NULL, 16 | uid = random_string("finisher_") 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 21 | 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_cols}{Output columns. String array.} 25 | 26 | \item{clean_annotations}{Boolean. Whether to remove intermediate annotations} 27 | 28 | \item{value_split_symbol}{String. Optional. Character used to split values within an annotation} 29 | 30 | \item{annotation_split_symbol}{String. Optional. Character used to split values between annotations} 31 | 32 | \item{include_metadata}{Boolean. Optional. Whether to include metadata keys. Sometimes useful in some annotations} 33 | 34 | \item{output_as_array}{Boolean. Optional. Whether to output as Array. Useful as input for other Spark transformers} 35 | 36 | \item{uid}{A character string used to uniquely identify the ML estimator.} 37 | } 38 | \value{ 39 | The object returned depends on the class of \code{x}. 40 | 41 | \itemize{ 42 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 43 | a Spark \code{Estimator} object and can be used to compose 44 | \code{Pipeline} objects. 45 | 46 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 47 | the NLP estimator appended to the pipeline. 48 | 49 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 50 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 51 | } 52 | } 53 | \description{ 54 | Spark ML transformer that outputs annotation values as strings.
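For instance, a minimal usage sketch (the sparklyr connection \code{sc} and the column names are illustrative): \preformatted{# pull plain string values out of an annotation column at the end of a pipeline
finisher <- nlp_finisher(sc, input_cols = c("ner_chunk"), output_cols = c("ner_chunk_str"), output_as_array = TRUE)
}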
55 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#finisher} 56 | } 57 | -------------------------------------------------------------------------------- /man/nlp_generate_assertion_train_set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_generate_assertion_train_set} 4 | \alias{nlp_generate_assertion_train_set} 5 | \title{Generate an assertion training set from an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_generate_assertion_train_set( 8 | reader, 9 | df, 10 | sentence_col = "sentence", 11 | assertion_col = "assertion_label" 12 | ) 13 | } 14 | \arguments{ 15 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 16 | 17 | \item{df}{a Spark Dataframe} 18 | 19 | \item{sentence_col}{the name of the sentence column} 20 | 21 | \item{assertion_col}{the name of the assertion column} 22 | } 23 | \value{ 24 | assertion training set data frame 25 | } 26 | \description{ 27 | Generate an assertion training set from an AnnotationToolJsonReader 28 | } 29 | -------------------------------------------------------------------------------- /man/nlp_generate_colln.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_generate_colln} 4 | \alias{nlp_generate_colln} 5 | \title{Generate a CoNLL format file from a data frame using an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_generate_colln( 8 | reader, 9 | df, 10 | path, 11 | task_col = "task_id", 12 | token_col = "token", 13 | ner_label = "ner_label" 14 | ) 15 | } 16 | \arguments{ 17 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 18 | 19 | \item{df}{a Spark Dataframe} 20 | 21 | \item{task_col}{the name of the task column} 22 | 23 | \item{token_col}{the name of the token column} 24 | 25 | \item{ner_label}{the name of the ner label column} 26 | } 27 | \description{ 28 | Generate a CoNLL format file from a data frame using an AnnotationToolJsonReader 29 | } 30 | -------------------------------------------------------------------------------- /man/nlp_generate_plain_assertion_train_set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation_tool_json_reader.R 3 | \name{nlp_generate_plain_assertion_train_set} 4 | \alias{nlp_generate_plain_assertion_train_set} 5 | \title{Generate a plain assertion training set from an AnnotationToolJsonReader} 6 | \usage{ 7 | nlp_generate_plain_assertion_train_set( 8 | reader, 9 | df, 10 | task_col = "task_id", 11 | token_col = "token", 12 | ner_label = "ner_label", 13 | assertion_label = "assertion_label" 14 | ) 15 | } 16 | \arguments{ 17 | \item{reader}{an instance of AnnotationToolJsonReader \code{\link{nlp_annotation_tool_json_reader}}} 18 | 19 | \item{df}{a Spark Dataframe} 20 | 21 | \item{task_col}{the name of the task column} 22 | 23 | \item{token_col}{the name of the token column} 24 | 25 | \item{ner_label}{the name of the ner label column} 26 | 27 | \item{assertion_col}{the name of the assertion column} 28 | } 29 | \value{ 30 | assertion training set data frame 31 | } 32 | \description{ 33 | Generate a plain assertion training set from an 
AnnotationToolJsonReader 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_get_classes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_get_classes} 4 | \alias{nlp_get_classes} 5 | \title{Get classes used to train a model} 6 | \usage{ 7 | nlp_get_classes(model) 8 | } 9 | \arguments{ 10 | \item{model}{a trained SparkNLP model that implements getClasses()} 11 | } 12 | \value{ 13 | a list of classes 14 | } 15 | \description{ 16 | Get classes used to train a model 17 | } 18 | -------------------------------------------------------------------------------- /man/nlp_graph_finisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/graph-finisher.R 3 | \name{nlp_graph_finisher} 4 | \alias{nlp_graph_finisher} 5 | \title{Spark NLP GraphFinisher} 6 | \usage{ 7 | nlp_graph_finisher( 8 | x, 9 | input_col, 10 | output_col, 11 | clean_annotations = NULL, 12 | include_metadata = NULL, 13 | output_as_array = NULL, 14 | uid = random_string("graph_finisher_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_col}{Input column. String.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{clean_annotations}{Whether to remove annotation columns (Default: true)} 25 | 26 | \item{include_metadata}{Annotation metadata format (Default: false)} 27 | 28 | \item{output_as_array}{Finisher generates an Array with the results instead of string (Default: true)} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Helper class to convert the knowledge graph from GraphExtraction into a generic format, such as RDF. 
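A minimal usage sketch (the connection \code{sc} and the column names are illustrative): \preformatted{# flatten the GraphExtraction output column into a plain array column
graph_finisher <- nlp_graph_finisher(sc, input_col = "graph", output_col = "graph_finished", output_as_array = TRUE)
}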
49 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#graphfinisher} 50 | } 51 | -------------------------------------------------------------------------------- /man/nlp_language_detector_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/language-detector-dl.R 3 | \name{nlp_language_detector_dl_pretrained} 4 | \alias{nlp_language_detector_dl_pretrained} 5 | \title{Load a pretrained Spark NLP LanguageDetectorDL model} 6 | \usage{ 7 | nlp_language_detector_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | alphabet = NULL, 12 | coalesce_sentences = NULL, 13 | language = NULL, 14 | threshold = NULL, 15 | threshold_label = NULL, 16 | name = NULL, 17 | lang = NULL, 18 | remote_loc = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{sc}{A Spark connection} 23 | 24 | \item{input_cols}{Input columns. String array.} 25 | 26 | \item{output_col}{Output column. String.} 27 | 28 | \item{alphabet}{alphabet used to feed the TensorFlow model for prediction (Map of string to integer) This should be an R environment} 29 | 30 | \item{coalesce_sentences}{If set to true, the output of all sentences will be averaged to one output instead of one output per sentence. (boolean)} 31 | 32 | \item{language}{used to map prediction to two-letter (ISO 639-1) language codes (Map of string to integer) This should be an R environment} 33 | 34 | \item{threshold}{The minimum threshold for the final result, otherwise it will be either Unknown or the value set in thresholdLabel.} 35 | 36 | \item{threshold_label}{The label to assign in case the prediction score is less than the threshold.} 37 | 38 | \item{name}{the name of the model to load. If NULL will use the default value} 39 | 40 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 41 | 42 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 43 | } 44 | \value{ 45 | The Spark NLP model with the pretrained model loaded 46 | } 47 | \description{ 48 | Create a pretrained Spark NLP \code{LanguageDetectorDL} model 49 | } 50 | -------------------------------------------------------------------------------- /man/nlp_lemmatizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemmatizer.R 3 | \name{nlp_lemmatizer} 4 | \alias{nlp_lemmatizer} 5 | \title{Spark NLP Lemmatizer} 6 | \usage{ 7 | nlp_lemmatizer( 8 | x, 9 | input_cols, 10 | output_col, 11 | dictionary_path = NULL, 12 | dictionary_key_delimiter = "->", 13 | dictionary_value_delimiter = "\\t", 14 | dictionary_read_as = "TEXT", 15 | dictionary_options = list(format = "text"), 16 | uid = random_string("lemmatizer_") 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 21 | 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column.
String.} 25 | 26 | \item{dictionary_path}{Path to lemma dictionary, in lemma vs possible words format.} 27 | 28 | \item{dictionary_key_delimiter}{key delimiter in the dictionary file} 29 | 30 | \item{dictionary_value_delimiter}{value delimiter in the dictionary file} 31 | 32 | \item{dictionary_read_as}{readAs TEXT or SPARK_DATASET} 33 | 34 | \item{dictionary_options}{options passed to the spark reader if read_as is SPARK_DATASET} 35 | 36 | \item{uid}{A character string used to uniquely identify the ML estimator.} 37 | } 38 | \value{ 39 | The object returned depends on the class of \code{x}. 40 | 41 | \itemize{ 42 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 43 | a Spark \code{Estimator} object and can be used to compose 44 | \code{Pipeline} objects. 45 | 46 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 47 | a default pretrained NLP model appended to the pipeline. 48 | 49 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 50 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 51 | } 52 | } 53 | \description{ 54 | Spark ML estimator that retrieves lemmas out of words with the objective of returning a base dictionary word 55 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#lemmatizer} 56 | } 57 | -------------------------------------------------------------------------------- /man/nlp_lemmatizer_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemmatizer.R 3 | \name{nlp_lemmatizer_pretrained} 4 | \alias{nlp_lemmatizer_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_lemmatizer_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{LemmatizerModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_light_pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/light-pipeline.R 3 | \name{nlp_light_pipeline} 4 | \alias{nlp_light_pipeline} 5 | \title{Spark NLP Light pipeline} 6 | \usage{ 7 | nlp_light_pipeline(x, parse_embeddings = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{a trained (fitted) pipeline} 11 | 12 | \item{parse_embeddings}{whether to parse the embeddings} 13 | } 14 | \value{ 15 | a LightPipeline object 16 | } 17 | \description{ 18 | LightPipelines are Spark ML pipelines converted into a single-machine, multithreaded task, becoming more than 19 | 10x faster for smaller amounts of data (small is relative, but 50k sentences is roughly a good maximum). 20 | To use them, simply plug in a trained (fitted) pipeline. 21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_longformer_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/longformer-for-token-classification.R 3 | \name{nlp_longformer_token_classification_pretrained} 4 | \alias{nlp_longformer_token_classification_pretrained} 5 | \title{Spark NLP LongformerForTokenClassification} 6 | \usage{ 7 | nlp_longformer_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{name}{the name of the model to load. If NULL will use the default value} 31 | 32 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 33 | 34 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 35 | } 36 | \value{ 37 | The Spark NLP model with the pretrained model loaded 38 | } 39 | \description{ 40 | LongformerForTokenClassification can load Longformer Models with a token classification head on top 41 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
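A minimal usage sketch (the connection \code{sc} and the column names are illustrative; \code{name}, \code{lang} and \code{remote_loc} fall back to their defaults when omitted): \preformatted{# load the default pretrained model and tag tokens with NER labels
token_classifier <- nlp_longformer_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "ner")
}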
42 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#longformerfortokenclassification} 43 | } 44 | -------------------------------------------------------------------------------- /man/nlp_marian_transformer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/marian_transformer.R 3 | \name{nlp_marian_transformer} 4 | \alias{nlp_marian_transformer} 5 | \title{Spark NLP MarianTransformer} 6 | \usage{ 7 | nlp_marian_transformer( 8 | x, 9 | input_cols, 10 | output_col, 11 | lang_id = NULL, 12 | max_input_length = NULL, 13 | max_output_length = NULL, 14 | vocabulary = NULL, 15 | uid = random_string("marian_transformer_") 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{lang_id}{A string representing the target language in the form of >>id<< (id = valid target language ID)} 26 | 27 | \item{max_input_length}{Controls the maximum length for encoder inputs (source language texts). Default: 40} 28 | 29 | \item{max_output_length}{Controls the maximum length for decoder outputs (target language texts). Default: 40} 30 | 31 | \item{vocabulary}{Vocabulary used to encode and decode piece tokens generated by SentencePiece. This will be set once the model is created and cannot be changed afterwards} 32 | 33 | \item{uid}{A character string used to uniquely identify the ML estimator.} 34 | } 35 | \value{ 36 | The object returned depends on the class of \code{x}. 37 | 38 | \itemize{ 39 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 40 | a Spark \code{Estimator} object and can be used to compose 41 | \code{Pipeline} objects. 42 | 43 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 44 | the NLP estimator appended to the pipeline. 45 | 46 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 47 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 48 | } 49 | } 50 | \description{ 51 | Spark ML transformer that performs neural machine translation using MarianMT models. 52 | See \url{https://nlp.johnsnowlabs.com/api/#com.johnsnowlabs.nlp.annotators.seq2seq.MarianTransformer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_marian_transformer_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/marian_transformer.R 3 | \name{nlp_marian_transformer_pretrained} 4 | \alias{nlp_marian_transformer_pretrained} 5 | \title{Load a pretrained Spark NLP Marian Transformer model} 6 | \usage{ 7 | nlp_marian_transformer_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded.
If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{MarianTransformerModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_medical_ner_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/medical-ner.R 3 | \name{nlp_medical_ner_pretrained} 4 | \alias{nlp_medical_ner_pretrained} 5 | \title{Load a pretrained Spark NLP Medical NER model} 6 | \usage{ 7 | nlp_medical_ner_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | include_confidence = NULL, 12 | label_casing = NULL, 13 | name = NULL, 14 | lang = NULL, 15 | remote_loc = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{sc}{A Spark connection} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{include_confidence}{whether to include confidence values} 26 | 27 | \item{label_casing}{Set the tag to case sensitive or not.Setting all labels of the NER models upper/lower case.} 28 | 29 | \item{name}{the name of the model to load. If NULL will use the default value} 30 | 31 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 32 | 33 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 34 | } 35 | \value{ 36 | The Spark NLP model with the pretrained model loaded 37 | } 38 | \description{ 39 | Create a pretrained Spark NLP \code{MedicalNerModel} model 40 | } 41 | -------------------------------------------------------------------------------- /man/nlp_multi_classifier_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/multi_classifier_dl.R 3 | \name{nlp_multi_classifier_dl_pretrained} 4 | \alias{nlp_multi_classifier_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Multilabel Classifier DL model} 6 | \usage{ 7 | nlp_multi_classifier_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | threshold = NULL, 12 | name = NULL, 13 | lang = NULL, 14 | remote_loc = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{threshold}{the minimum threshold for each label to be accepted} 25 | 26 | \item{name}{the name of the model to load. If NULL will use the default value} 27 | 28 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 29 | 30 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Create a pretrained Spark NLP \code{MultiClassifierDLModel} model 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_ner_chunker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ner_chunker.R 3 | \name{nlp_ner_chunker} 4 | \alias{nlp_ner_chunker} 5 | \title{Spark NLP NerChunker} 6 | \usage{ 7 | nlp_ner_chunker( 8 | x, 9 | input_cols, 10 | output_col, 11 | regex_parsers = NULL, 12 | uid = random_string("ner_chunker_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{regex_parsers}{A list of regex patterns to match chunks, for example: c("<DT>?<JJ>*<NN>")} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that extracts phrases that fit into a known pattern using the NER tags 43 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_release_notes#1-nerchunker} 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_ner_crf_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ner-crf.R 3 | \name{nlp_ner_crf_pretrained} 4 | \alias{nlp_ner_crf_pretrained} 5 | \title{Load a pretrained Spark NLP NER CRF model} 6 | \usage{ 7 | nlp_ner_crf_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{NerCrfModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_ner_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ner-dl.R 3 | \name{nlp_ner_dl_pretrained} 4 | \alias{nlp_ner_dl_pretrained} 5 | \title{Load a pretrained Spark NLP NER DL model} 6 | \usage{ 7 | nlp_ner_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | include_confidence = NULL, 12 | include_all_confidence_scores = NULL, 13 | name = NULL, 14 | lang = NULL, 15 | remote_loc = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{sc}{A Spark connection} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{include_confidence}{whether to include confidence values} 26 | 27 | \item{name}{the name of the model to load. If NULL will use the default value} 28 | 29 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 30 | 31 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 32 | } 33 | \value{ 34 | The Spark NLP model with the pretrained model loaded 35 | } 36 | \description{ 37 | Create a pretrained Spark NLP \code{NerDLModel} model 38 | } 39 | -------------------------------------------------------------------------------- /man/nlp_ngram_generator.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ngram-generator.R 3 | \name{nlp_ngram_generator} 4 | \alias{nlp_ngram_generator} 5 | \title{Spark NLP NGramGenerator} 6 | \usage{ 7 | nlp_ngram_generator( 8 | x, 9 | input_cols, 10 | output_col, 11 | n = NULL, 12 | enable_cumulative = NULL, 13 | delimiter = NULL, 14 | uid = random_string("ngram_generator_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{n}{number elements per n-gram (>=1)} 25 | 26 | \item{enable_cumulative}{whether to calculate just the actual n-grams or all n-grams from 1 through n} 27 | 28 | \item{delimiter}{glue character used to join the tokens} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that takes as input a sequence of strings (e.g. 
the output of a Tokenizer, Normalizer, Stemmer, 49 | Lemmatizer, and StopWordsCleaner). The parameter n is used to determine the number of terms in each n-gram. 50 | The output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string of n 51 | consecutive words with annotatorType CHUNK same as the Chunker annotator. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#ngramgenerator} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_norvig_spell_checker_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/norvig-spell-checker.R 3 | \name{nlp_norvig_spell_checker_pretrained} 4 | \alias{nlp_norvig_spell_checker_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_norvig_spell_checker_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{NorvigSweetingModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_perceptron.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perceptron.R 3 | \name{nlp_perceptron} 4 | \alias{nlp_perceptron} 5 | \title{Spark NLP Perceptron} 6 | \usage{ 7 | nlp_perceptron( 8 | x, 9 | input_cols, 10 | output_col, 11 | n_iterations = NULL, 12 | pos_column = NULL, 13 | uid = random_string("perceptron_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{n_iterations}{Number of iterations for training. May improve accuracy but takes longer. Default 5.} 24 | 25 | \item{pos_column}{Column containing an array of POS Tags matching every token on the line.} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | a default pretrained NLP model appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that sets a POS tag to each word within a sentence. Its train data (train_pos) is a spark 46 | dataset of POS format values with Annotation columns. 47 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#postagger} 48 | } 49 | -------------------------------------------------------------------------------- /man/nlp_perceptron_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perceptron.R 3 | \name{nlp_perceptron_pretrained} 4 | \alias{nlp_perceptron_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_perceptron_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{Perceptron} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_pos.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perceptron.R 3 | \name{nlp_pos} 4 | \alias{nlp_pos} 5 | \title{Read a part of speech tagging training file into a dataset} 6 | \usage{ 7 | nlp_pos( 8 | sc, 9 | file_path, 10 | delimiter = NULL, 11 | output_pos_col = NULL, 12 | output_document_col = NULL, 13 | output_text_col = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{Spark connection} 18 | 19 | \item{file_path}{path to the text file with the training data} 20 | 21 | \item{delimiter}{the delimiter used in the training data} 22 | 23 | \item{output_pos_col}{the pos column name for the output data frame} 24 | 25 | \item{output_document_col}{the document column name for the output data frame} 26 | 27 | \item{output_text_col}{the text column name for the output data frame} 28 | } 29 | \value{ 30 | Spark dataframe containing the data 31 | } 32 | \description{ 33 | In order to train a Part of Speech Tagger annotator, we need to get corpus data as a spark dataframe. 34 | This function does this: it reads a plain text file and transforms it to a spark dataset that is ready 35 | for training a POS tagger. 
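A minimal usage sketch (the file path, delimiter, and sparklyr connection \code{sc} are illustrative): \preformatted{# read a delimited word|tag training file into a dataframe ready for training a POS tagger
pos_train_df <- nlp_pos(sc, "path/to/pos_corpus.txt", delimiter = "|")
}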
36 | See the Scala API docs for the default parameter values 37 | (\url{https://nlp.johnsnowlabs.com/api/index.html#com.johnsnowlabs.nlp.training.POS}) 38 | } 39 | -------------------------------------------------------------------------------- /man/nlp_pretrained_pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pretrained-pipeline.R 3 | \name{nlp_pretrained_pipeline} 4 | \alias{nlp_pretrained_pipeline} 5 | \title{Spark NLP Pretrained pipeline} 6 | \usage{ 7 | nlp_pretrained_pipeline( 8 | x, 9 | download_name, 10 | lang = "en", 11 | source = "public/models", 12 | parse_embeddings_vectors = FALSE, 13 | disk_location = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{a Spark connection, Spark dataframe, or a string or character vector} 18 | 19 | \item{download_name}{the name of the pretrained pipeline to download and create} 20 | 21 | \item{lang}{the language of the pipeline} 22 | 23 | \item{source}{the source for the pipeline file} 24 | 25 | \item{parse_embeddings_vectors}{whether to parse the embeddings vectors or not} 26 | 27 | \item{disk_location}{optional location on disk that the pipeline should be loaded from} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of 34 | a \code{ml_pipeline} created from the pretrained pipeline. 35 | 36 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, the pretrained pipeline is created and immediately 37 | run on the provided dataframe using \code{ml_fit_and_transform}, returning the transformed data frame. 38 | } 39 | } 40 | \description{ 41 | Creates a Spark NLP pretrained pipeline. See 42 | \url{https://nlp.johnsnowlabs.com/api/index.html#com.johnsnowlabs.nlp.pretrained.PretrainedPipeline} for the 43 | default values for the parameters if left null 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_pubtator_read_dataset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_pubtator_read_dataset} 4 | \alias{nlp_pubtator_read_dataset} 5 | \title{PubTator Dataset} 6 | \usage{ 7 | nlp_pubtator_read_dataset(sc, path) 8 | } 9 | \arguments{ 10 | \item{sc}{Spark connection} 11 | 12 | \item{path}{path to a PubTator file} 13 | } 14 | \value{ 15 | Spark Dataframe created from the PubTator file 16 | } 17 | \description{ 18 | The PubTator format includes medical papers’ titles, abstracts, and tagged chunks 19 | (see \href{http://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=format_3783}{PubTator Docs} and 20 | \href{http://github.com/chanzuckerberg/MedMentions}{MedMentions Docs} 21 | for more information). We can create a Spark DataFrame from a PubTator text file.
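For example, a minimal usage sketch (the file path and the sparklyr connection \code{sc} are illustrative): \preformatted{# read a PubTator corpus file into a Spark dataframe
pubtator_df <- nlp_pubtator_read_dataset(sc, "path/to/corpus_pubtator.txt")
}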
22 | } 23 | -------------------------------------------------------------------------------- /man/nlp_re_ner_chunks_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/re_ner_chunks_filter.R 3 | \name{nlp_re_ner_chunks_filter} 4 | \alias{nlp_re_ner_chunks_filter} 5 | \title{Spark NLP RENerChunksFilter} 6 | \usage{ 7 | nlp_re_ner_chunks_filter( 8 | x, 9 | input_cols, 10 | output_col, 11 | max_syntactic_distance = NULL, 12 | relation_pairs, 13 | uid = random_string("re_ner_chunks_filter_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{max_syntactic_distance}{Maximal syntactic distance, as threshold (Default: 0)} 24 | 25 | \item{relation_pairs}{List of dash-separated pairs of named entities 26 | ("ENTITY1-ENTITY2", e.g. "Biomarker-RelativeDay"), which will be processed} 27 | 28 | \item{uid}{A character string used to uniquely identify the ML estimator.} 29 | } 30 | \value{ 31 | The object returned depends on the class of \code{x}. 32 | 33 | \itemize{ 34 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 35 | a Spark \code{Estimator} object and can be used to compose 36 | \code{Pipeline} objects. 37 | 38 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 39 | the NLP estimator appended to the pipeline. 40 | 41 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 42 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 43 | } 44 | } 45 | \description{ 46 | Spark ML transformer that filters and outputs combinations of relations between 47 | extracted entities, for further processing. This annotator is especially useful 48 | to create inputs for the RelationExtractionDLModel. 49 | } 50 | \details{ 51 | See \url{https://nlp.johnsnowlabs.com/docs/en/licensed_annotators#renerchunksfilter} 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_recursive_pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/recursive-pipeline.R 3 | \name{nlp_recursive_pipeline} 4 | \alias{nlp_recursive_pipeline} 5 | \title{Spark NLP RecursivePipeline} 6 | \usage{ 7 | nlp_recursive_pipeline(x, ..., uid = random_string("recursive_pipeline_")) 8 | } 9 | \arguments{ 10 | \item{x}{Either a \code{spark_connection} or \code{ml_pipeline_stage} objects} 11 | 12 | \item{...}{\code{ml_pipeline_stage} objects} 13 | 14 | \item{uid}{uid for the pipeline} 15 | } 16 | \value{ 17 | When \code{x} is a \code{spark_connection}, \code{ml_pipeline()} returns an empty pipeline object. 18 | When \code{x} is a \code{ml_pipeline_stage}, \code{ml_pipeline()} returns an \code{ml_pipeline} with the stages 19 | set to \code{x} and any transformers or estimators given in \code{...}. 
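For illustration, a hedged sketch of both calling conventions (the stage objects are hypothetical and assumed to have been created earlier with their nlp_* constructors; \code{sc} is an open sparklyr connection):

# Start from a Spark connection to get an empty recursive pipeline
pipeline <- nlp_recursive_pipeline(sc)

# Or build it directly from existing pipeline stages
pipeline <- nlp_recursive_pipeline(document_assembler, sentence_detector, tokenizer)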
20 | } 21 | \description{ 22 | Recursive pipelines are Spark NLP-specific pipelines that allow a Spark ML Pipeline to know about itself on every 23 | Pipeline Stage task, allowing annotators to utilize this same pipeline against external resources to process them 24 | in the same way the user decides. Only some of our annotators take advantage of this. RecursivePipeline behaves 25 | exactly the same as normal Spark ML pipelines, so it can be used with the same intent. 26 | See \url{https://nlp.johnsnowlabs.com/docs/en/concepts#recursivepipeline} 27 | } 28 | -------------------------------------------------------------------------------- /man/nlp_recursive_tokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/recursive-tokenizer.R 3 | \name{nlp_recursive_tokenizer} 4 | \alias{nlp_recursive_tokenizer} 5 | \title{Spark NLP RecursiveTokenizer} 6 | \usage{ 7 | nlp_recursive_tokenizer( 8 | x, 9 | input_cols, 10 | output_col, 11 | infixes = NULL, 12 | prefixes = NULL, 13 | suffixes = NULL, 14 | white_list = NULL, 15 | uid = random_string("recursive_tokenizer_") 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{infixes}{strings that will be split when found at the middle of a token} 26 | 27 | \item{prefixes}{strings that will be split when found at the beginning of a token} 28 | 29 | \item{suffixes}{strings that will be split when found at the end of a token} 30 | 31 | \item{white_list}{whitelist} 32 | 33 | \item{uid}{A character string used to uniquely identify the ML estimator.} 34 | } 35 | \value{ 36 | The object returned depends on the class of \code{x}. 37 | 38 | \itemize{ 39 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 40 | a Spark \code{Estimator} object and can be used to compose 41 | \code{Pipeline} objects. 42 | 43 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 44 | the NLP estimator appended to the pipeline. 45 | 46 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 47 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 48 | } 49 | } 50 | \description{ 51 | Spark ML estimator that tokenizes raw text into tokens, splitting recursively on the supplied prefix, infix, and suffix patterns. 52 | See \url{https://nlp.johnsnowlabs.com/api/index#com.johnsnowlabs.nlp.annotators.RecursiveTokenizer} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_regex_matcher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/regex_matcher.R 3 | \name{nlp_regex_matcher} 4 | \alias{nlp_regex_matcher} 5 | \title{Spark NLP RegexMatcher} 6 | \usage{ 7 | nlp_regex_matcher( 8 | x, 9 | input_cols, 10 | output_col, 11 | strategy = NULL, 12 | rules_path, 13 | rules_path_delimiter, 14 | rules_path_read_as = "TEXT", 15 | rules_path_options = list(format = "text"), 16 | uid = random_string("regex_matcher_") 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 21 | 22 | \item{input_cols}{Input columns.
String array.} 23 | 24 | \item{output_col}{Output column. String.} 25 | 26 | \item{strategy}{Can be any of MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE} 27 | 28 | \item{rules_path}{Path to file containing a set of regex,key pairs} 29 | 30 | \item{rules_path_delimiter}{delimiter between regex and key in the file} 31 | 32 | \item{rules_path_read_as}{TEXT or SPARK_DATASET} 33 | 34 | \item{rules_path_options}{options passed to Spark reader if read_as is SPARK_DATASET} 35 | 36 | \item{uid}{A character string used to uniquely identify the ML estimator.} 37 | } 38 | \value{ 39 | The object returned depends on the class of \code{x}. 40 | 41 | \itemize{ 42 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 43 | a Spark \code{Estimator} object and can be used to compose 44 | \code{Pipeline} objects. 45 | 46 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 47 | the NLP estimator appended to the pipeline. 48 | 49 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 50 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 51 | } 52 | } 53 | \description{ 54 | Spark ML estimator that matches regular expressions from an external rules file against the input text and tags each match with its associated key. 55 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#regexmatcher} 56 | } 57 | -------------------------------------------------------------------------------- /man/nlp_relation_extraction_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/relation_extraction_dl.R 3 | \name{nlp_relation_extraction_dl_pretrained} 4 | \alias{nlp_relation_extraction_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Relation Extraction DL model} 6 | \usage{ 7 | nlp_relation_extraction_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | prediction_threshold = NULL, 12 | name = NULL, 13 | lang = NULL, 14 | remote_loc = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{prediction_threshold}{Minimal activation of the target unit to encode a new relation instance (Default: 0.5f)} 25 | 26 | \item{name}{the name of the model to load. If NULL will use the default value} 27 | 28 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 29 | 30 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Create a pretrained Spark NLP \code{RelationExtractionDLModel} model 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_relation_extraction_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/relation_extraction.R 3 | \name{nlp_relation_extraction_pretrained} 4 | \alias{nlp_relation_extraction_pretrained} 5 | \title{Load a pretrained Spark NLP Relation Extraction model} 6 | \usage{ 7 | nlp_relation_extraction_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | relation_pairs, 12 | max_syntactic_distance = NULL, 13 | feature_scaling = NULL, 14 | prediction_threshold = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{relation_pairs}{List of dash-separated pairs of named entities ("ENTITY1-ENTITY2", 28 | e.g. "Biomarker-RelativeDay"), which will be processed} 29 | 30 | \item{max_syntactic_distance}{Maximal syntactic distance, as threshold (Default: 0)} 31 | 32 | \item{feature_scaling}{Feature scaling method.} 33 | 34 | \item{prediction_threshold}{Minimal activation of the target unit to encode a new relation instance (Default: 0.5f)} 35 | 36 | \item{name}{the name of the model to load. If NULL will use the default value} 37 | 38 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 39 | 40 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 41 | } 42 | \value{ 43 | The Spark NLP model with the pretrained model loaded 44 | } 45 | \description{ 46 | Create a pretrained Spark NLP \code{RelationExtractionModel} model 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_resource_downloader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/resource_downloader.R 3 | \name{nlp_resource_downloader} 4 | \alias{nlp_resource_downloader} 5 | \alias{ResourceDownloader} 6 | \alias{nlp_show_public_pipelines} 7 | \alias{nlp_show_public_models} 8 | \alias{nlp_clear_cache} 9 | \alias{nlp_show_available_annotators} 10 | \title{SparkNLP ResourceDownloader functions} 11 | \usage{ 12 | nlp_show_public_pipelines(sc, lang = NULL, version = NULL) 13 | 14 | nlp_show_public_models(sc, annotator = NULL, lang = NULL, version = NULL) 15 | 16 | nlp_clear_cache(sc, name = NULL, language = NULL, remote_loc = NULL) 17 | 18 | nlp_show_available_annotators(sc) 19 | } 20 | \arguments{ 21 | \item{sc}{a spark_connect object} 22 | 23 | \item{lang}{language to restrict the results to} 24 | 25 | \item{version}{Spark NLP version to restrict results to} 26 | 27 | \item{annotator}{name of annotator to restrict results} 28 | 29 | \item{name}{name of object to clear} 30 | 31 | \item{language}{language to clear} 32 | 33 | \item{remote_loc}{remote_loc of models to clear} 34 | } 35 | \value{ 36 | a markdown table containing the models or pipelines filtered by the provided arguments 37 | } 38 | \description{ 39 | ResourceDownloader provides functions to easily look for pretrained models & pipelines 40 | inside Spark NLP. You can filter models or pipelines via language, version, 41 | or the name of the annotator 42 | } 43 | -------------------------------------------------------------------------------- /man/nlp_roberta_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/roberta-embeddings.R 3 | \name{nlp_roberta_embeddings_pretrained} 4 | \alias{nlp_roberta_embeddings_pretrained} 5 | \title{Spark NLP RoBertaEmbeddings} 6 | \usage{ 7 | nlp_roberta_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | storage_ref = NULL, 16 | name = NULL, 17 | lang = NULL, 18 | remote_loc = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column. String.} 25 | 26 | \item{batch_size}{Size of every batch (Default depends on model).} 27 | 28 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 29 | 30 | \item{dimension}{Number of embedding dimensions (Default depends on model)} 31 | 32 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 33 | 34 | \item{storage_ref}{Unique identifier for storage (Default: this.uid)} 35 | 36 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 37 | 38 | \item{uid}{A character string used to uniquely identify the ML estimator.} 39 | } 40 | \value{ 41 | The object returned depends on the class of \code{x}. 
42 | 43 | \itemize{ 44 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 45 | a Spark \code{Estimator} object and can be used to compose 46 | \code{Pipeline} objects. 47 | 48 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 49 | the NLP estimator appended to the pipeline. 50 | 51 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 52 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 53 | } 54 | } 55 | \description{ 56 | Spark ML transformer that 57 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#robertaembeddings} 58 | } 59 | -------------------------------------------------------------------------------- /man/nlp_roberta_sentence_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/roberta_sentence_embeddings.R 3 | \name{nlp_roberta_sentence_embeddings_pretrained} 4 | \alias{nlp_roberta_sentence_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP RoBertaSentenceEmbeddings model} 6 | \usage{ 7 | nlp_roberta_sentence_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to lowercase tokens or not} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{defines the output layer of BERT when calculating embeddings} 32 | 33 | \item{max_sentence_length}{max sentence length to process} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{RoBertaSentenceEmbeddings} model. 46 | Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in 47 | RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, 48 | Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, 49 | Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model 50 | released in 2018. 51 | } 52 | \details{ 53 | It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates. 
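A short, hedged loading example (the output column name is arbitrary, the model name is left to the default pretrained model, and \code{sc} is assumed to be an active sparklyr connection):

# Load the default pretrained RoBERTa sentence embeddings model
roberta_sent_embeddings <- nlp_roberta_sentence_embeddings_pretrained(
  sc,
  input_cols = c("document"),
  output_col = "sentence_embeddings"
)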
54 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#robertabertsentenceembeddings} 55 | } 56 | -------------------------------------------------------------------------------- /man/nlp_roberta_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/roberta-for-token-classification.R 3 | \name{nlp_roberta_token_classification_pretrained} 4 | \alias{nlp_roberta_token_classification_pretrained} 5 | \title{Spark NLP RoBertaForTokenClassification} 6 | \usage{ 7 | nlp_roberta_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | RoBertaForTokenClassification can load RoBERTa Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#xlnetfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/nlp_sentence_detector_dl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence_detector_dl.R 3 | \name{nlp_sentence_detector_dl} 4 | \alias{nlp_sentence_detector_dl} 5 | \title{Spark NLP SentenceDetectorDLApproach} 6 | \usage{ 7 | nlp_sentence_detector_dl( 8 | x, 9 | input_cols, 10 | output_col, 11 | epochs_number = NULL, 12 | impossible_penultimates = NULL, 13 | model = NULL, 14 | output_logs_path = NULL, 15 | validation_split = NULL, 16 | explode_sentences = NULL, 17 | uid = random_string("sentence_detector_dl_") 18 | ) 19 | } 20 | \arguments{ 21 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. 
String.} 26 | 27 | \item{epochs_number}{maximum number of epochs to train} 28 | 29 | \item{impossible_penultimates}{impossible penultimates} 30 | 31 | \item{model}{model architecture} 32 | 33 | \item{output_logs_path}{path to folder to output logs} 34 | 35 | \item{validation_split}{choose the proportion of training dataset to be validated against the model on each epoch} 36 | 37 | \item{explode_sentences}{a flag indicating whether to split sentences into different Dataset rows.} 38 | 39 | \item{uid}{A character string used to uniquely identify the ML estimator.} 40 | } 41 | \value{ 42 | The object returned depends on the class of \code{x}. 43 | 44 | \itemize{ 45 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 46 | a Spark \code{Estimator} object and can be used to compose 47 | \code{Pipeline} objects. 48 | 49 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 50 | the NLP estimator appended to the pipeline. 51 | 52 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 53 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 54 | } 55 | } 56 | \description{ 57 | Spark ML estimator that trains a deep-learning model for detecting sentence boundaries. 58 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators} 59 | } 60 | -------------------------------------------------------------------------------- /man/nlp_sentence_detector_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence_detector_dl.R 3 | \name{nlp_sentence_detector_dl_pretrained} 4 | \alias{nlp_sentence_detector_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Sentence Detector DL model} 6 | \usage{ 7 | nlp_sentence_detector_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | impossible_penultimates = NULL, 12 | model = NULL, 13 | explode_sentences = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{sc}{A Spark connection} 21 | 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column. String.} 25 | 26 | \item{impossible_penultimates}{impossible penultimates} 27 | 28 | \item{model}{model architecture} 29 | 30 | \item{explode_sentences}{a flag indicating whether to split sentences into different Dataset rows} 31 | 32 | \item{name}{the name of the model to load. If NULL will use the default value} 33 | 34 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 35 | 36 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 37 | } 38 | \value{ 39 | The Spark NLP model with the pretrained model loaded 40 | } 41 | \description{ 42 | Create a pretrained Spark NLP \code{SentenceDetectorDLModel} model 43 | } 44 | -------------------------------------------------------------------------------- /man/nlp_sentence_embeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence-embeddings.R 3 | \name{nlp_sentence_embeddings} 4 | \alias{nlp_sentence_embeddings} 5 | \title{Spark NLP SentenceEmbeddings} 6 | \usage{ 7 | nlp_sentence_embeddings( 8 | x, 9 | input_cols, 10 | output_col, 11 | pooling_strategy = NULL, 12 | storage_ref = NULL, 13 | uid = random_string("sentence_embeddings_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{pooling_strategy}{Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM} 24 | 25 | \item{storage_ref}{storage reference for the embeddings} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that converts the results from WordEmbeddings or BertEmbeddings into sentence or document 46 | embeddings by either summing up or averaging all the word embeddings in a sentence or a document 47 | (depending on the input_cols). 48 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#sentenceembeddings} 49 | } 50 | -------------------------------------------------------------------------------- /man/nlp_sentence_entity_resolver_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentence_entity_resolver.R 3 | \name{nlp_sentence_entity_resolver_pretrained} 4 | \alias{nlp_sentence_entity_resolver_pretrained} 5 | \title{Load a pretrained Spark NLP Sentence Entity Resolver model} 6 | \usage{ 7 | nlp_sentence_entity_resolver_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | confidence_function = NULL, 13 | distance_function = NULL, 14 | miss_as_empty = NULL, 15 | neighbors = NULL, 16 | threshold = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column.
String.} 28 | 29 | \item{case_sensitive}{whether to treat the entities as case sensitive} 30 | 31 | \item{confidence_function}{what function to use to calculate confidence: INVERSE or SOFTMAX} 32 | 33 | \item{distance_function}{what distance function to use for KNN: 'EUCLIDEAN' or 'COSINE'} 34 | 35 | \item{miss_as_empty}{whether or not to return an empty annotation on unmatched chunks} 36 | 37 | \item{neighbors}{number of neighbours to consider in the KNN query to calculate WMD} 38 | 39 | \item{threshold}{threshold value for the aggregated distance} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{SentenceEntityResolverModel} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_sentiment_dl_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentiment-dl.R 3 | \name{nlp_sentiment_dl_pretrained} 4 | \alias{nlp_sentiment_dl_pretrained} 5 | \title{Load a pretrained Spark NLP Sentiment DL model} 6 | \usage{ 7 | nlp_sentiment_dl_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{SentimentDLModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_set_input_cols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_input_cols} 4 | \alias{nlp_set_input_cols} 5 | \title{Set the input column names} 6 | \usage{ 7 | nlp_set_input_cols(jobj, input_cols) 8 | } 9 | \arguments{ 10 | \item{jobj}{the object to set the input columns on} 11 | 12 | \item{input_cols}{the input column names} 13 | } 14 | \value{ 15 | the jobj object with the input columns set 16 | } 17 | \description{ 18 | Set the input column names 19 | } 20 | -------------------------------------------------------------------------------- /man/nlp_set_output_col.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_output_col} 4 | \alias{nlp_set_output_col} 5 | \title{Set the output column name} 6 | \usage{ 7 | nlp_set_output_col(jobj, output_col) 8 | } 9 | \arguments{ 10 | \item{jobj}{the object to set the output column on} 11 | 12 | \item{output_col}{the output column name} 13 | } 14 | \value{ 15 | the jobj object with the output column set 16 | } 17 | \description{ 18 | Set the output column name 19 | } 20 | -------------------------------------------------------------------------------- /man/nlp_set_param.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_param} 4 | \alias{nlp_set_param} 5 | \title{Set a parameter on an NLP model object} 6 | \usage{ 7 | nlp_set_param(x, param, value) 8 | } 9 | \arguments{ 10 | \item{x}{A Spark NLP object, either a pipeline stage or an annotator} 11 | 12 | \item{param}{The parameter to set} 13 | 14 | \item{value}{The value to use when setting the parameter} 15 | } 16 | \value{ 17 | the NLP model object with the parameter set 18 | } 19 | \description{ 20 | Set a parameter on an NLP model object 21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_set_param_tuple2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_set_param_tuple2} 4 | \alias{nlp_set_param_tuple2} 5 | \title{Set a Tuple2 parameter on an NLP model object} 6 | \usage{ 7 | nlp_set_param_tuple2(x, param, value) 8 | } 9 | \arguments{ 10 | \item{x}{A Spark NLP object, either a pipeline stage or an annotator} 11 | 12 | \item{param}{The parameter to set} 13 | 14 | \item{value}{The value to use when setting the parameter.
This should be a list of size 2} 15 | } 16 | \value{ 17 | the NLP model object with the parameter set 18 | } 19 | \description{ 20 | Set a Tuple2 parameter on an NLP model object 21 | } 22 | -------------------------------------------------------------------------------- /man/nlp_spark_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotation.R 3 | \name{nlp_spark_annotation} 4 | \alias{nlp_spark_annotation} 5 | \title{Create a Spark NLP Annotation object inside of Spark} 6 | \usage{ 7 | nlp_spark_annotation( 8 | sc, 9 | annotatorType, 10 | begin, 11 | end, 12 | result, 13 | metadata, 14 | embeddings = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A \code{spark_connection}} 19 | 20 | \item{annotatorType}{the type of annotation (string)} 21 | 22 | \item{begin}{the index of the first character under this annotation (integer)} 23 | 24 | \item{end}{the index after the last character under this annotation (integer)} 25 | 26 | \item{result}{the main output of the annotation (string)} 27 | 28 | \item{metadata}{associated metadata for this annotation (named list)} 29 | 30 | \item{embeddings}{vector of embeddings (Array(Float)). Currently unimplemented.} 31 | } 32 | \value{ 33 | the Spark NLP Annotation object 34 | } 35 | \description{ 36 | This S3 generic is used for a Spark NLP Annotation object that exists inside of 37 | a Spark session. 38 | } 39 | \seealso{ 40 | \url{https://nlp.johnsnowlabs.com/docs/en/concepts#annotation} 41 | } 42 | -------------------------------------------------------------------------------- /man/nlp_stemmer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stemmer.R 3 | \name{nlp_stemmer} 4 | \alias{nlp_stemmer} 5 | \title{Spark NLP Stemmer} 6 | \usage{ 7 | nlp_stemmer( 8 | x, 9 | input_cols, 10 | output_col, 11 | language = NULL, 12 | uid = random_string("stemmer_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{language}{language to use} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 
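As an illustration of that dispatch (hedged sketch; \code{pipeline} and \code{tokens_tbl} are hypothetical objects created earlier, and \code{sc} is an open sparklyr connection):

# spark_connection: returns a stemmer stage for later composition
stemmer <- nlp_stemmer(sc, input_cols = c("token"), output_col = "stem")

# ml_pipeline: appends the stemmer to an existing pipeline
pipeline <- nlp_stemmer(pipeline, input_cols = c("token"), output_col = "stem")

# tbl_spark: constructs the stage and applies it to the data immediately
stemmed_tbl <- nlp_stemmer(tokens_tbl, input_cols = c("token"), output_col = "stem")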
39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that returns hard-stems out of words with the objective of retrieving the meaningful part of the word 43 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#stemmer} 44 | } 45 | -------------------------------------------------------------------------------- /man/nlp_stop_words_cleaner.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stop_words_cleaner.R 3 | \name{nlp_stop_words_cleaner} 4 | \alias{nlp_stop_words_cleaner} 5 | \title{Spark NLP StopWordsCleaner} 6 | \usage{ 7 | nlp_stop_words_cleaner( 8 | x, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | locale = NULL, 13 | stop_words = NULL, 14 | uid = random_string("stop_words_cleaner_") 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{case_sensitive}{Whether to do a case sensitive comparison over the stop words.} 25 | 26 | \item{locale}{Locale of the input for case insensitive matching. Ignored when caseSensitive is true.} 27 | 28 | \item{stop_words}{The words to be filtered out.} 29 | 30 | \item{uid}{A character string used to uniquely identify the ML estimator.} 31 | } 32 | \value{ 33 | The object returned depends on the class of \code{x}. 34 | 35 | \itemize{ 36 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 37 | a Spark \code{Estimator} object and can be used to compose 38 | \code{Pipeline} objects. 39 | 40 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 41 | the NLP estimator appended to the pipeline. 42 | 43 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 44 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 45 | } 46 | } 47 | \description{ 48 | Spark ML transformer that excludes from a sequence of strings (e.g. the output of a Tokenizer, Normalizer, 49 | Lemmatizer, and Stemmer) and drops all the stop words from the input sequences. 50 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#stopwordscleaner} 51 | } 52 | -------------------------------------------------------------------------------- /man/nlp_symmetric_delete_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/symmetric-delete.R 3 | \name{nlp_symmetric_delete_pretrained} 4 | \alias{nlp_symmetric_delete_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_symmetric_delete_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{SymmetricDeleteModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_t5_transformer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/t5_transformer.R 3 | \name{nlp_t5_transformer} 4 | \alias{nlp_t5_transformer} 5 | \title{Spark NLP T5Transformer} 6 | \usage{ 7 | nlp_t5_transformer( 8 | x, 9 | input_cols, 10 | output_col, 11 | task = NULL, 12 | max_output_length = NULL, 13 | uid = random_string("t5_transformer_") 14 | ) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{task}{name to give the task being performed} 24 | 25 | \item{max_output_length}{maximum output length} 26 | 27 | \item{uid}{A character string used to uniquely identify the ML estimator.} 28 | } 29 | \value{ 30 | The object returned depends on the class of \code{x}. 31 | 32 | \itemize{ 33 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 34 | a Spark \code{Estimator} object and can be used to compose 35 | \code{Pipeline} objects. 36 | 37 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 38 | the NLP estimator appended to the pipeline. 39 | 40 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 41 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 42 | } 43 | } 44 | \description{ 45 | Spark ML transformer that uses Google's T5 text-to-text model to perform sequence-to-sequence tasks such as summarization, translation, and question answering. 46 | See \url{https://nlp.johnsnowlabs.com/api/#com.johnsnowlabs.nlp.annotators.seq2seq.T5Transformer} 47 | } 48 | -------------------------------------------------------------------------------- /man/nlp_t5_transformer_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/t5_transformer.R 3 | \name{nlp_t5_transformer_pretrained} 4 | \alias{nlp_t5_transformer_pretrained} 5 | \title{Load a pretrained Spark NLP T5 Transformer model} 6 | \usage{ 7 | nlp_t5_transformer_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | task = NULL, 12 | max_output_length = NULL, 13 | name = NULL, 14 | lang = NULL, 15 | remote_loc = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{sc}{A Spark connection} 20 | 21 | \item{input_cols}{Input columns. String array.} 22 | 23 | \item{output_col}{Output column. String.} 24 | 25 | \item{task}{name to give the task being performed} 26 | 27 | \item{max_output_length}{the maximum output length} 28 | 29 | \item{name}{the name of the model to load. If NULL will use the default value} 30 | 31 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 32 | 33 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 34 | } 35 | \value{ 36 | The Spark NLP model with the pretrained model loaded 37 | } 38 | \description{ 39 | Create a pretrained Spark NLP \code{T5TransformerModel} model 40 | } 41 | -------------------------------------------------------------------------------- /man/nlp_token_assembler.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/token-assembler.R 3 | \name{nlp_token_assembler} 4 | \alias{nlp_token_assembler} 5 | \title{Spark NLP TokenAssembler} 6 | \usage{ 7 | nlp_token_assembler( 8 | x, 9 | input_cols, 10 | output_col, 11 | uid = random_string("token_assembler_") 12 | ) 13 | } 14 | \arguments{ 15 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 16 | 17 | \item{input_cols}{Input columns. String array.} 18 | 19 | \item{output_col}{Output column. String.} 20 | 21 | \item{uid}{A character string used to uniquely identify the ML estimator.} 22 | } 23 | \value{ 24 | The object returned depends on the class of \code{x}. 25 | 26 | \itemize{ 27 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 28 | a Spark \code{Estimator} object and can be used to compose 29 | \code{Pipeline} objects. 30 | 31 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 32 | the NLP estimator appended to the pipeline. 33 | 34 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 35 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 36 | } 37 | } 38 | \description{ 39 | Spark ML transformer that reassembles cleaned-up tokens into sentence or document annotations so the reshaped text can be used by downstream annotators. 40 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#tokenassembler-getting-data-reshaped} 41 | } 42 | -------------------------------------------------------------------------------- /man/nlp_typed_dependency_parser_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/typed-dependency-parser.R 3 | \name{nlp_typed_dependency_parser_pretrained} 4 | \alias{nlp_typed_dependency_parser_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_typed_dependency_parser_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model.
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{TypedDependencyParserModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_univ_sent_encoder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/univ_sent_encoder.R 3 | \name{nlp_univ_sent_encoder} 4 | \alias{nlp_univ_sent_encoder} 5 | \title{Spark NLP UniversalSentenceEncoder} 6 | \usage{ 7 | nlp_univ_sent_encoder( 8 | x, 9 | input_cols, 10 | output_col, 11 | dimension = NULL, 12 | uid = random_string("univ_sent_encoder_") 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 17 | 18 | \item{input_cols}{Input columns. String array.} 19 | 20 | \item{output_col}{Output column. String.} 21 | 22 | \item{dimension}{dimension to use for the embeddings} 23 | 24 | \item{uid}{A character string used to uniquely identify the ML estimator.} 25 | } 26 | \value{ 27 | The object returned depends on the class of \code{x}. 28 | 29 | \itemize{ 30 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 31 | a Spark \code{Estimator} object and can be used to compose 32 | \code{Pipeline} objects. 33 | 34 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 35 | the NLP estimator appended to the pipeline. 36 | 37 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 38 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 39 | } 40 | } 41 | \description{ 42 | Spark ML transformer that encodes text into high dimensional vectors that can be used for text classification, 43 | semantic similarity, clustering and other natural language tasks. 44 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#universalsentenceencoder} 45 | } 46 | -------------------------------------------------------------------------------- /man/nlp_univ_sent_encoder_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/univ_sent_encoder.R 3 | \name{nlp_univ_sent_encoder_pretrained} 4 | \alias{nlp_univ_sent_encoder_pretrained} 5 | \title{Load pretrained universal sentence encoder} 6 | \usage{ 7 | nlp_univ_sent_encoder_pretrained( 8 | sc, 9 | input_cols = NULL, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. 
If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Loads pretrained universal sentence encoder into a Spark NLP annotator 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{nlp_version} 4 | \alias{nlp_version} 5 | \title{Spark NLP version} 6 | \usage{ 7 | nlp_version() 8 | } 9 | \value{ 10 | the version of the Spark NLP library in use 11 | } 12 | \description{ 13 | Spark NLP version 14 | } 15 | -------------------------------------------------------------------------------- /man/nlp_vivekn_sentiment_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/vivekn-sentiment-detector.R 3 | \name{nlp_vivekn_sentiment_pretrained} 4 | \alias{nlp_vivekn_sentiment_pretrained} 5 | \title{Load a pretrained Spark NLP model} 6 | \usage{ 7 | nlp_vivekn_sentiment_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{sc}{A Spark connection} 18 | 19 | \item{input_cols}{Input columns. String array.} 20 | 21 | \item{output_col}{Output column. String.} 22 | 23 | \item{name}{the name of the model to load. If NULL will use the default value} 24 | 25 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 26 | 27 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 28 | } 29 | \value{ 30 | The Spark NLP model with the pretrained model loaded 31 | } 32 | \description{ 33 | Create a pretrained Spark NLP \code{ViveknSentimentModel} model 34 | } 35 | -------------------------------------------------------------------------------- /man/nlp_word_embeddings_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word-embeddings.R 3 | \name{nlp_word_embeddings_model} 4 | \alias{nlp_word_embeddings_model} 5 | \title{Create a Spark NLP WordEmbeddingsModel} 6 | \usage{ 7 | nlp_word_embeddings_model( 8 | sc, 9 | input_cols, 10 | output_col, 11 | storage_ref = NULL, 12 | dimension, 13 | case_sensitive = NULL, 14 | include_storage = NULL, 15 | lazy_annotator = NULL, 16 | read_cache_size = NULL, 17 | include_embeddings = NULL, 18 | uid = random_string("word_embeddings_") 19 | ) 20 | } 21 | \arguments{ 22 | \item{sc}{Spark connection} 23 | 24 | \item{input_cols}{Input columns. String array.} 25 | 26 | \item{output_col}{Output column. 
String.} 27 | 28 | \item{storage_ref}{binding to NerDLModel trained by that embeddings} 29 | 30 | \item{dimension}{number of word embeddings dimensions} 31 | 32 | \item{case_sensitive}{whether to ignore case in tokens for embeddings matching} 33 | 34 | \item{include_storage}{include the storage} 35 | 36 | \item{lazy_annotator}{boolean for laziness} 37 | 38 | \item{read_cache_size}{size for the read cache} 39 | 40 | \item{include_embeddings}{whether or not to include word embeddings when saving this annotator to disk (single or within pipeline)} 41 | 42 | \item{uid}{unique identifier for this instance} 43 | } 44 | \value{ 45 | a Spark transformer WordEmbeddingsModel 46 | } 47 | \description{ 48 | This function creates a WordEmbeddingsModel which uses the provided embeddings_ref. 49 | } 50 | -------------------------------------------------------------------------------- /man/nlp_word_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word-embeddings.R 3 | \name{nlp_word_embeddings_pretrained} 4 | \alias{nlp_word_embeddings_pretrained} 5 | \title{Load pretrained word embeddings} 6 | \usage{ 7 | nlp_word_embeddings_pretrained( 8 | sc, 9 | input_cols = NULL, 10 | output_col, 11 | name = NULL, 12 | lang = NULL, 13 | remote_loc = NULL, 14 | case_sensitive = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{sc}{A Spark connection} 19 | 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. String.} 23 | 24 | \item{name}{the name of the model to load. If NULL will use the default value} 25 | 26 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 27 | 28 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 29 | 30 | \item{case_sensitive}{whether to treat the words as case sensitive} 31 | } 32 | \value{ 33 | The Spark NLP model with the pretrained model loaded 34 | } 35 | \description{ 36 | Loads pretrained word embeddings into a Spark NLP annotator 37 | } 38 | -------------------------------------------------------------------------------- /man/nlp_xlm_roberta_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlm-roberta-embeddings.R 3 | \name{nlp_xlm_roberta_embeddings_pretrained} 4 | \alias{nlp_xlm_roberta_embeddings_pretrained} 5 | \title{Spark NLP XlmRoBertaEmbeddings} 6 | \usage{ 7 | nlp_xlm_roberta_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | storage_ref = NULL, 16 | name = NULL, 17 | lang = NULL, 18 | remote_loc = NULL 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_cols}{Input columns. String array.} 23 | 24 | \item{output_col}{Output column. 
String.} 25 | 26 | \item{batch_size}{Size of every batch (Default depends on model).} 27 | 28 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 29 | 30 | \item{dimension}{Number of embedding dimensions (Default depends on model)} 31 | 32 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 33 | 34 | \item{storage_ref}{Unique identifier for storage (Default: this.uid)} 35 | 36 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 37 | 38 | \item{uid}{A character string used to uniquely identify the ML estimator.} 39 | } 40 | \value{ 41 | The object returned depends on the class of \code{x}. 42 | 43 | \itemize{ 44 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 45 | a Spark \code{Estimator} object and can be used to compose 46 | \code{Pipeline} objects. 47 | 48 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 49 | the NLP estimator appended to the pipeline. 50 | 51 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 52 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 53 | } 54 | } 55 | \description{ 56 | Spark ML transformer that 57 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#xmlrobertaembeddings} 58 | } 59 | -------------------------------------------------------------------------------- /man/nlp_xlm_roberta_sentence_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlm_roberta_sentence_embeddings.R 3 | \name{nlp_xlm_roberta_sentence_embeddings_pretrained} 4 | \alias{nlp_xlm_roberta_sentence_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP XlmRoBertaSentenceEmbeddings model} 6 | \usage{ 7 | nlp_xlm_roberta_sentence_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | max_sentence_length = NULL, 15 | name = NULL, 16 | lang = NULL, 17 | remote_loc = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{sc}{A Spark connection} 22 | 23 | \item{input_cols}{Input columns. String array.} 24 | 25 | \item{output_col}{Output column. String.} 26 | 27 | \item{case_sensitive}{whether to lowercase tokens or not} 28 | 29 | \item{batch_size}{batch size} 30 | 31 | \item{dimension}{defines the output layer of BERT when calculating embeddings} 32 | 33 | \item{max_sentence_length}{max sentence length to process} 34 | 35 | \item{name}{the name of the model to load. If NULL will use the default value} 36 | 37 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 38 | 39 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 40 | } 41 | \value{ 42 | The Spark NLP model with the pretrained model loaded 43 | } 44 | \description{ 45 | Create a pretrained Spark NLP \code{XlmRoBertaSentenceEmbeddings} model. 46 | See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#xlmrobertasentenceembeddings} 47 | } 48 | \details{ 49 | Sentence-level embeddings using XLM-RoBERTa. 
The XLM-RoBERTa model was proposed in 50 | Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, 51 | Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, 52 | Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on 53 | Facebook's RoBERTa model released in 2019. It is a large multi-lingual language model, 54 | trained on 2.5TB of filtered CommonCrawl data. 55 | } 56 | -------------------------------------------------------------------------------- /man/nlp_xlnet_embeddings_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlnet-embeddings.R 3 | \name{nlp_xlnet_embeddings_pretrained} 4 | \alias{nlp_xlnet_embeddings_pretrained} 5 | \title{Load a pretrained Spark NLP XlnetEmbeddings model} 6 | \usage{ 7 | nlp_xlnet_embeddings_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | case_sensitive = NULL, 12 | batch_size = NULL, 13 | dimension = NULL, 14 | lazy_annotator = NULL, 15 | max_sentence_length = NULL, 16 | storage_ref = NULL, 17 | name = NULL, 18 | lang = NULL, 19 | remote_loc = NULL 20 | ) 21 | } 22 | \arguments{ 23 | \item{sc}{A Spark connection} 24 | 25 | \item{input_cols}{Input columns. String array.} 26 | 27 | \item{output_col}{Output column. String.} 28 | 29 | \item{case_sensitive}{whether to treat the tokens as case insensitive when looking up their embedding} 30 | 31 | \item{batch_size}{batch size} 32 | 33 | \item{dimension}{the embedding dimension} 34 | 35 | \item{lazy_annotator}{use as a lazy annotator or not} 36 | 37 | \item{max_sentence_length}{set the maximum sentence length} 38 | 39 | \item{storage_ref}{storage reference name} 40 | 41 | \item{name}{the name of the model to load. If NULL will use the default value} 42 | 43 | \item{lang}{the language of the model to be loaded. If NULL will use the default value} 44 | 45 | \item{remote_loc}{the remote location of the model. If NULL will use the default value} 46 | } 47 | \value{ 48 | The Spark NLP model with the pretrained model loaded 49 | } 50 | \description{ 51 | Create a pretrained Spark NLP \code{XlnetEmbeddings} model 52 | } 53 | -------------------------------------------------------------------------------- /man/nlp_xlnet_token_classification_pretrained.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xlnet-for-token-classification.R 3 | \name{nlp_xlnet_token_classification_pretrained} 4 | \alias{nlp_xlnet_token_classification_pretrained} 5 | \title{Spark NLP XlnetForTokenClassification} 6 | \usage{ 7 | nlp_xlnet_token_classification_pretrained( 8 | sc, 9 | input_cols, 10 | output_col, 11 | batch_size = NULL, 12 | case_sensitive = NULL, 13 | max_sentence_length = NULL, 14 | name = NULL, 15 | lang = NULL, 16 | remote_loc = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{input_cols}{Input columns. String array.} 21 | 22 | \item{output_col}{Output column. 
String.} 23 | 24 | \item{batch_size}{Size of every batch (Default depends on model).} 25 | 26 | \item{case_sensitive}{Whether to ignore case in index lookups (Default depends on model)} 27 | 28 | \item{max_sentence_length}{Max sentence length to process (Default: 128)} 29 | 30 | \item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.} 31 | 32 | \item{uid}{A character string used to uniquely identify the ML estimator.} 33 | } 34 | \value{ 35 | The object returned depends on the class of \code{x}. 36 | 37 | \itemize{ 38 | \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_estimator} object. The object contains a pointer to 39 | a Spark \code{Estimator} object and can be used to compose 40 | \code{Pipeline} objects. 41 | 42 | \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with 43 | the NLP estimator appended to the pipeline. 44 | 45 | \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, an estimator is constructed then 46 | immediately fit with the input \code{tbl_spark}, returning an NLP model. 47 | } 48 | } 49 | \description{ 50 | XlnetForTokenClassification can load XLNet Models with a token classification head on top 51 | (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 52 | See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#xlnetfortokenclassification} 53 | } 54 | -------------------------------------------------------------------------------- /man/set_nlp_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{set_nlp_version} 4 | \alias{set_nlp_version} 5 | \title{Set the version of the Spark NLP library to use} 6 | \usage{ 7 | set_nlp_version(version) 8 | } 9 | \arguments{ 10 | \item{version}{Spark NLP version number to use when starting Spark Session} 11 | } 12 | \description{ 13 | Set the version of the Spark NLP library to use 14 | } 15 | -------------------------------------------------------------------------------- /sparknlp.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(sparklyr) 3 | library(sparknlp) 4 | 5 | if (identical(Sys.getenv("NOT_CRAN"), "true")) { 6 | test_check("sparknlp") 7 | on.exit({spark_disconnect_all()}) 8 | } 9 | 10 | -------------------------------------------------------------------------------- /tests/testthat/.gitignore: -------------------------------------------------------------------------------- 1 | metastore_db 2 | -------------------------------------------------------------------------------- /tests/testthat/data/.gitignore: -------------------------------------------------------------------------------- 1 | dist.psv 2 | 
.result.conll.crc 3 | result.conll 4 | -------------------------------------------------------------------------------- /tests/testthat/data/dependency_treebank/wsj_0001.dp: -------------------------------------------------------------------------------- 1 | Pierre NNP 2 2 | Vinken NNP 8 3 | , , 2 4 | 61 CD 5 5 | years NNS 6 6 | old JJ 2 7 | , , 2 8 | will MD 0 9 | join VB 8 10 | the DT 11 11 | board NN 9 12 | as IN 9 13 | a DT 15 14 | nonexecutive JJ 15 15 | director NN 12 16 | Nov. NNP 9 17 | 29 CD 16 18 | . . 8 19 | 20 | Mr. NNP 2 21 | Vinken NNP 3 22 | is VBZ 0 23 | chairman NN 3 24 | of IN 4 25 | Elsevier NNP 7 26 | N.V. NNP 12 27 | , , 12 28 | the DT 12 29 | Dutch NNP 12 30 | publishing VBG 12 31 | group NN 5 32 | . . 3 33 | -------------------------------------------------------------------------------- /tests/testthat/data/en.test.conllu: -------------------------------------------------------------------------------- 1 | # newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200 2 | # sent_id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-0001 3 | # newpar id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200-p0001 4 | # text = What if Google Morphed Into GoogleOS? 5 | 1 What what PRON WP PronType=Int 0 root 0:root _ 6 | 2 if if SCONJ IN _ 4 mark 4:mark _ 7 | 3 Google Google PROPN NNP Number=Sing 4 nsubj 4:nsubj _ 8 | 4 Morphed morph VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 1 advcl 1:advcl:if _ 9 | 5 Into into ADP IN _ 6 case 6:case _ 10 | 6 GoogleOS GoogleOS PROPN NNP Number=Sing 4 obl 4:obl:into SpaceAfter=No 11 | 7 ? ? PUNCT . _ 4 punct 4:punct _ 12 | 13 | # newdoc id = weblog-blogspot.com_zentelligence_20040423000200_ENG_20040423_000200 14 | # sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0003 15 | # text = Google is a nice search engine. 16 | 1 Google Google PROPN NNP Number=Sing 6 nsubj 6:nsubj _ 17 | 2 is be AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 6 cop 6:cop _ 18 | 3 a a DET DT Definite=Ind|PronType=Art 6 det 6:det _ 19 | 4 nice nice ADJ JJ Degree=Pos 6 amod 6:amod _ 20 | 5 search search NOUN NN Number=Sing 6 compound 6:compound _ 21 | 6 engine engine NOUN NN Number=Sing 0 root 0:root SpaceAfter=No 22 | 7 . . PUNCT . _ 6 punct 6:punct _ 23 | 24 | # sent_id = weblog-blogspot.com_marketview_20050511222700_ENG_20050511_222700-0004 25 | # text = Does anybody use it for anything else? 26 | 1 Does do AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 aux 3:aux _ 27 | 2 anybody anybody PRON NN Number=Sing 3 nsubj 3:nsubj _ 28 | 3 use use VERB VB VerbForm=Inf 0 root 0:root _ 29 | 4 it it PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 3 obj 3:obj _ 30 | 5 for for ADP IN _ 6 case 6:case _ 31 | 6 anything anything PRON NN Number=Sing 3 obl 3:obl:for _ 32 | 7 else else ADJ JJ Degree=Pos 6 amod 6:amod SpaceAfter=No 33 | 8 ? ? PUNCT . 
_ 3 punct 3:punct _ -------------------------------------------------------------------------------- /tests/testthat/data/entities.txt: -------------------------------------------------------------------------------- 1 | i think 2 | Feeling strangely 3 | guitar lessons -------------------------------------------------------------------------------- /tests/testthat/data/entity_ruler/patterns.csv: -------------------------------------------------------------------------------- 1 | PERSON|Jon 2 | PERSON|John 3 | PERSON|John Snow 4 | LOCATION|Winterfell -------------------------------------------------------------------------------- /tests/testthat/data/gender.csv: -------------------------------------------------------------------------------- 1 | male,man,male,boy,gentleman,he,him 2 | female,woman,female,girl,lady,old-lady,she,her 3 | neutral,neutral -------------------------------------------------------------------------------- /tests/testthat/data/gender.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": "Gender", 3 | "ruleScope": "sentence", 4 | "completeMatchRegex": "true" 5 | } 6 | -------------------------------------------------------------------------------- /tests/testthat/data/pos_corpus.txt: -------------------------------------------------------------------------------- 1 | the|DT cats|NNS are|VBP laying|VBG in|IN front|JJ of|IN the|DT fireplace|NN .|. the|DT dogs|NNS are|VBP 2 | staying|VBG cool|NN in|IN the|DT kitchen|NN .|. 3 | -------------------------------------------------------------------------------- /tests/testthat/data/re_train.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/re_train.parquet -------------------------------------------------------------------------------- /tests/testthat/data/regex_match.txt: -------------------------------------------------------------------------------- 1 | the\\s\\w+, followed by ‘the' 2 | -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | This movie is the best movie I have watched ever! In my opinion this movie can win an award.,0 3 | This was a terrible movie!
The acting was bad really bad!,1 -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/.part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/sentiment.parquet/.part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/sentiment.parquet/_SUCCESS -------------------------------------------------------------------------------- /tests/testthat/data/sentiment.parquet/part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/data/sentiment.parquet/part-00000-f52ab1ca-1b8e-4b36-b52e-6041abb05345-c000.snappy.parquet -------------------------------------------------------------------------------- /tests/testthat/data/sentiment_dictionary.txt: -------------------------------------------------------------------------------- 1 | superb,positive 2 | bad,negative 3 | lack of, revert 4 | very,increment 5 | barely,decrement 6 | -------------------------------------------------------------------------------- /tests/testthat/data/words.txt: -------------------------------------------------------------------------------- 1 | abacay 2 | abacas 3 | abacate 4 | abacaxi 5 | abaci 6 | abacinate 7 | abacination 8 | abacisci 9 | abaciscus 10 | abacist 11 | aback 12 | abacli 13 | Abaco 14 | abacot 15 | abacterial 16 | abactinal 17 | abactinally 18 | abaction 19 | abactor 20 | abaculi 21 | abaculus 22 | abacus 23 | abacuses 24 | Abad 25 | abada 26 | Abadan 27 | Abaddon 28 | abadejo 29 | abadengo 30 | abadia 31 | Abadite 32 | abaff 33 | abaft 34 | Abagael 35 | Abagail 36 | Abagtha 37 | abay 38 | abayah 39 | Abailard 40 | abaisance 41 | abaised 42 | abaiser 43 | abaisse 44 | abaissed 45 | abaka 46 | Abakan 47 | abakas 48 | Abakumov 49 | abalation 50 | abalienate 51 | abalienated 52 | abalienating 53 | abalienation 54 | abalone 55 | abalones 56 | Abama 57 | abamp 58 | abampere 59 | abamperes 60 | abamps 61 | Abana -------------------------------------------------------------------------------- /tests/testthat/testthat-albert-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | # config <- spark_config() 4 | # config$`sparklyr.shell.driver-memory` <- "8G" 5 | # sc <- spark_connect(master = "local", version = "2.4.3", config = config) 6 | text_tbl <- testthat_tbl("test_text") 7 | 8 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 9 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 10 | sentdetect <- 
nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 11 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 12 | 13 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 14 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 15 | 16 | assign("sc", sc, envir = parent.frame()) 17 | assign("pipeline", pipeline, envir = parent.frame()) 18 | assign("test_data", test_data, envir = parent.frame()) 19 | }) 20 | 21 | teardown({ 22 | spark_disconnect(sc) 23 | rm(sc, envir = .GlobalEnv) 24 | rm(pipeline, envir = .GlobalEnv) 25 | rm(test_data, envir = .GlobalEnv) 26 | }) 27 | 28 | test_that("nlp_albert_embeddings pretrained", { 29 | model <- nlp_albert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "albert") 30 | transformed_data <- ml_transform(model, test_data) 31 | expect_true("albert" %in% colnames(transformed_data)) 32 | }) 33 | 34 | test_that("nlp_albert_embeddings load", { 35 | model_files <- list.files("~/cache_pretrained/") 36 | albert_model_file <- max(Filter(function(s) startsWith(s, "albert_base"), model_files)) 37 | model <- ml_load(sc, paste0("~/cache_pretrained/", albert_model_file)) 38 | transformed_data <- ml_transform(model, test_data) 39 | expect_true("albert" %in% colnames(transformed_data)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/testthat-albert-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_albert_token_classification pretrained", { 25 | model <- nlp_albert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_albert_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "albert_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-annotation_tool_json_reader.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | assign("sc", sc, envir = parent.frame()) 4 | 
5 | if (file.exists(here::here("tests", "testthat", "data", "result.conll"))) { 6 | file.remove(here::here("tests", "testthat", "data", "result.conll")) 7 | } 8 | }) 9 | 10 | teardown({ 11 | rm(sc, envir = .GlobalEnv) 12 | }) 13 | 14 | test_that("nlp_generate_assertion_read_dataset", { 15 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 16 | reader <- nlp_annotation_tool_json_reader(sc) 17 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 18 | 19 | expect_true("ner_label" %in% colnames(mydf)) 20 | }) 21 | 22 | test_that("nlp_generate_assertion_train_set", { 23 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 24 | reader <- nlp_annotation_tool_json_reader(sc) 25 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 26 | train_df <- nlp_generate_assertion_train_set(reader, mydf) 27 | 28 | expect_true("target" %in% colnames(train_df)) 29 | }) 30 | 31 | test_that("nlp_generate_plain_assertion_train_set", { 32 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 33 | reader <- nlp_annotation_tool_json_reader(sc) 34 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 35 | train_df <- nlp_generate_plain_assertion_train_set(reader, mydf) 36 | 37 | expect_true("assertion" %in% colnames(train_df)) 38 | }) 39 | 40 | test_that("nlp_generate_colln", { 41 | train_data_file <- here::here("tests", "testthat", "data", "result.json") 42 | reader <- nlp_annotation_tool_json_reader(sc) 43 | mydf <- nlp_annotation_read_dataset(reader, train_data_file) 44 | train_df <- nlp_generate_colln(reader, mydf, here::here("tests", "testthat", "data", "result.conll")) 45 | 46 | expect_true(file.exists(here::here("tests", "testthat", "data", "result.conll"))) 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/testthat-bert-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | # config <- spark_config() 4 | # config$`sparklyr.shell.driver-memory` <- "8G" 5 | # sc <- spark_connect(master = "local", version = "2.4.3", config = config) 6 | text_tbl <- testthat_tbl("test_text") 7 | 8 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 9 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 10 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 11 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 12 | 13 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 14 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 15 | 16 | assign("sc", sc, envir = parent.frame()) 17 | assign("pipeline", pipeline, envir = parent.frame()) 18 | assign("test_data", test_data, envir = parent.frame()) 19 | }) 20 | 21 | teardown({ 22 | spark_disconnect(sc) 23 | rm(sc, envir = .GlobalEnv) 24 | rm(pipeline, envir = .GlobalEnv) 25 | rm(test_data, envir = .GlobalEnv) 26 | }) 27 | 28 | test_that("nlp_bert_embeddings pretrained", { 29 | model <- nlp_bert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 30 | transformed_data <- ml_transform(model, test_data) 31 | expect_true("bert" %in% colnames(transformed_data)) 32 | }) 33 | 34 | test_that("nlp_bert_embeddings load", { 35 | model_files <- list.files("~/cache_pretrained/") 36 | bert_model_file <- max(Filter(function(s) startsWith(s, 
"small_bert"), model_files)) 37 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 38 | transformed_data <- ml_transform(model, test_data) 39 | expect_true("bert" %in% colnames(transformed_data)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/testthat-bert-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_bert_token_classification pretrained", { 25 | model <- nlp_bert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_bert_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "bert_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-bert_sentence_chunk_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentence <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | word_embeddings <- nlp_bert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "word_embeddings", 10 | name = "biobert_pubmed_base_cased") 11 | ner_model <- nlp_medical_ner_pretrained(sc, input_cols = c("sentence", "token", "word_embeddings"), output_col = "ner", 12 | name = "ner_clinical_biobert", remote_loc = "clinical/models") 13 | ner_converter <- nlp_ner_converter(sc, input_cols = c("sentence", "token", "ner"), output_col = "ner_chunk") 14 | 15 | pipeline <- ml_pipeline(assembler, sentence, tokenizer, word_embeddings, ner_model, ner_converter) 16 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 17 | 18 | assign("sc", sc, envir = parent.frame()) 19 | assign("pipeline", pipeline, 
envir = parent.frame()) 20 | assign("test_data", test_data, envir = parent.frame()) 21 | }) 22 | 23 | teardown({ 24 | spark_disconnect(sc) 25 | rm(sc, envir = .GlobalEnv) 26 | rm(pipeline, envir = .GlobalEnv) 27 | rm(test_data, envir = .GlobalEnv) 28 | }) 29 | 30 | test_that("nlp_bert_sentence_embeddings pretrained", { 31 | model <- nlp_bert_sentence_chunk_embeddings_pretrained(sc, input_cols = c("sentence", "ner_chunk"), output_col = "bert_sentence_chunk_embeddings") 32 | transformed_data <- ml_transform(model, test_data) 33 | expect_true("bert_sentence_chunk_embeddings" %in% colnames(transformed_data)) 34 | 35 | expect_true(inherits(model, "nlp_bert_sentence_chunk_embeddings")) 36 | }) 37 | 38 | -------------------------------------------------------------------------------- /tests/testthat/testthat-bert_sentence_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_bert_sentence_embeddings pretrained", { 24 | model <- nlp_bert_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "bert_sentence_embeddings") 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("bert_sentence_embeddings" %in% colnames(transformed_data)) 27 | 28 | expect_true(inherits(model, "nlp_bert_sentence_embeddings")) 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/testthat-chunk2token.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | ngram <- nlp_ngram_generator(sc, input_cols = c("token"), output_col = "ngram", n = 2) 10 | 11 | 12 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, ngram) 13 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 14 | 15 | assign("sc", sc, envir = parent.frame()) 16 | assign("pipeline", pipeline, envir = parent.frame()) 17 | assign("test_data", test_data, envir = parent.frame()) 18 | }) 19 | 20 | teardown({ 21 | rm(sc, envir = .GlobalEnv) 22 | rm(pipeline, envir = .GlobalEnv) 23 | rm(test_data, envir = .GlobalEnv) 24 | }) 25 | 26 | test_that("chunk2token param setting", { 27 | test_args <- list( 28 | input_cols = c("string1"), 29 | output_col = "string1" 30 | ) 31 | 32 | test_param_setting(sc, nlp_chunk2token, test_args) 33 | }) 34 | 35 | test_that("nlp_chunk2token 
spark_connection", { 36 | test_annotator <- nlp_chunk2token(sc, input_cols = c("ngram"), output_col = "token_chunk") 37 | transformed_data <- ml_transform(test_annotator, test_data) 38 | expect_true("token_chunk" %in% colnames(transformed_data)) 39 | 40 | expect_true(inherits(test_annotator, "nlp_chunk2token")) 41 | }) 42 | 43 | test_that("nlp_chunk2token ml_pipeline", { 44 | test_annotator <- nlp_chunk2token(pipeline, input_cols = c("ngram"), output_col = "token_chunk") 45 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 46 | expect_true("token_chunk" %in% colnames(transformed_data)) 47 | }) 48 | 49 | test_that("nlp_chunk2token tbl_spark", { 50 | transformed_data <- nlp_chunk2token(test_data, input_cols = c("ngram"), output_col = "token_chunk") 51 | expect_true("token_chunk" %in% colnames(transformed_data)) 52 | }) 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/testthat-chunker.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | pos <- nlp_perceptron_pretrained(sc, input_cols = c("sentence", "token"), output_col = "pos") 10 | 11 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, pos) 12 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 13 | 14 | assign("sc", sc, envir = parent.frame()) 15 | assign("pipeline", pipeline, envir = parent.frame()) 16 | assign("test_data", test_data, envir = parent.frame()) 17 | }) 18 | 19 | teardown({ 20 | spark_disconnect(sc) 21 | rm(sc, envir = .GlobalEnv) 22 | rm(pipeline, envir = .GlobalEnv) 23 | rm(test_data, envir = .GlobalEnv) 24 | }) 25 | 26 | test_that("nlp_chunker param setting", { 27 | test_args <- list( 28 | input_cols = c("sentence", "pos"), 29 | output_col = "chunk", 30 | regex_parsers = c("
?*", "+") 31 | ) 32 | 33 | test_param_setting(sc, nlp_chunker, test_args) 34 | }) 35 | 36 | test_that("nlp_nlp_chunker spark_connection", { 37 | test_annotator <- nlp_chunker(sc, input_cols = c("sentence","pos"), output_col = "chunk") 38 | transformed_data <- ml_transform(test_annotator, test_data) 39 | expect_true("chunk" %in% colnames(transformed_data)) 40 | }) 41 | 42 | test_that("nlp_nlp_chunker ml_pipeline", { 43 | test_annotator <- nlp_chunker(pipeline, input_cols = c("sentence","pos"), output_col = "chunk") 44 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 45 | expect_true("chunk" %in% colnames(transformed_data)) 46 | }) 47 | 48 | test_that("nlp_nlp_chunker tbl_spark", { 49 | transformed_data <- nlp_chunker(test_data, input_cols = c("sentence","pos"), output_col = "chunk") 50 | expect_true("chunk" %in% colnames(transformed_data)) 51 | }) 52 | 53 | -------------------------------------------------------------------------------- /tests/testthat/testthat-distilbert-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_distilbert_embeddings pretrained", { 25 | model <- nlp_distilbert_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "distilbert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("distilbert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_distilbert_embeddings load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "distilbert_base"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | transformed_data <- ml_transform(model, test_data) 35 | expect_true("embeddings" %in% colnames(transformed_data)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/testthat-distilbert-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("document"), output_col = "token") 9 | # TODO: put other annotators here as needed 10 | 11 | pipeline <- ml_pipeline(assembler, sentdetect, 
tokenizer) 12 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 13 | 14 | assign("sc", sc, envir = parent.frame()) 15 | assign("pipeline", pipeline, envir = parent.frame()) 16 | assign("test_data", test_data, envir = parent.frame()) 17 | }) 18 | 19 | teardown({ 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | 26 | test_that("nlp_distilbert_token_classification pretrained", { 27 | model <- nlp_distilbert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "distilbert") 28 | transformed_data <- ml_transform(model, test_data) 29 | expect_true("distilbert" %in% colnames(transformed_data)) 30 | }) 31 | 32 | test_that("nlp_distilbert_token_classification load", { 33 | model_files <- list.files("~/cache_pretrained/") 34 | bert_model_file <- max(Filter(function(s) startsWith(s, "distilbert_base_token"), model_files)) 35 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 36 | model <- nlp_set_output_col(model, "label") 37 | transformed_data <- ml_transform(model, test_data) 38 | expect_true("label" %in% colnames(transformed_data)) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/testthat-document-assembler.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | assign("sc", sc, envir = parent.frame()) 5 | assign("text_tbl", text_tbl, envir = parent.frame()) 6 | }) 7 | 8 | teardown({ 9 | spark_disconnect(sc) 10 | rm(sc, envir = .GlobalEnv) 11 | rm(text_tbl, envir = .GlobalEnv) 12 | }) 13 | 14 | test_that("nlp_document_assembler() param setting", { 15 | test_args <- list( 16 | input_col = "text", 17 | output_col = "document", 18 | id_col = "rowkey", 19 | metadata_col = "met", 20 | cleanup_mode = "shrink") 21 | test_param_setting(sc, nlp_document_assembler, test_args) 22 | }) 23 | 24 | test_that("nlp_document_assembler() spark_connection", { 25 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 26 | transformed_data <- ml_transform(assembler, text_tbl) 27 | 28 | expect_true("document" %in% colnames(transformed_data)) 29 | }) 30 | 31 | test_that("nlp_document_assembler() ml_pipeline", { 32 | pipeline <- ml_pipeline(sc) 33 | assembler <- nlp_document_assembler(pipeline, input_col = "text", output_col = "document") 34 | 35 | transformed_data <- ml_fit_and_transform(assembler, text_tbl) 36 | 37 | expect_true("document" %in% colnames(transformed_data)) 38 | }) 39 | 40 | test_that("nlp_document_assembler() tbl_spark", { 41 | transformed_data <- nlp_document_assembler(text_tbl, input_col = "text", output_col = "document") 42 | expect_true("document" %in% colnames(transformed_data)) 43 | }) 44 | -------------------------------------------------------------------------------- /tests/testthat/testthat-drug_normalizer.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | 
assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | rm(sc, envir = .GlobalEnv) 18 | rm(pipeline, envir = .GlobalEnv) 19 | rm(test_data, envir = .GlobalEnv) 20 | }) 21 | 22 | test_that("drug_normalizer param setting", { 23 | test_args <- list( 24 | input_cols = c("string1"), 25 | output_col = "string1", 26 | lower_case = TRUE, 27 | policy = "string1" 28 | ) 29 | 30 | test_param_setting(sc, nlp_drug_normalizer, test_args) 31 | }) 32 | 33 | test_that("nlp_drug_normalizer spark_connection", { 34 | test_annotator <- nlp_drug_normalizer(sc, input_cols = c("document"), output_col = "document_normalized") 35 | transformed_data <- ml_transform(test_annotator, test_data) 36 | expect_true("document_normalized" %in% colnames(transformed_data)) 37 | expect_true(inherits(test_annotator, "nlp_drug_normalizer")) 38 | }) 39 | 40 | test_that("nlp_drug_normalizer ml_pipeline", { 41 | test_annotator <- nlp_drug_normalizer(pipeline, input_cols = c("document"), output_col = "document_normalized") 42 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 43 | expect_true("document_normalized" %in% colnames(transformed_data)) 44 | }) 45 | 46 | test_that("nlp_drug_normalizer tbl_spark", { 47 | transformed_data <- nlp_drug_normalizer(test_data, input_cols = c("document"), output_col = "document_normalized") 48 | expect_true("document_normalized" %in% colnames(transformed_data)) 49 | }) 50 | 51 | -------------------------------------------------------------------------------- /tests/testthat/testthat-elmo-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 6 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 7 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 8 | 9 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 10 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 11 | 12 | assign("sc", sc, envir = parent.frame()) 13 | assign("pipeline", pipeline, envir = parent.frame()) 14 | assign("test_data", test_data, envir = parent.frame()) 15 | }) 16 | 17 | teardown({ 18 | spark_disconnect(sc) 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_elmo_embeddings pretrained", { 25 | model <- nlp_elmo_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "elmo") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("elmo" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_elmo_embeddings load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | elmo_model_file <- max(Filter(function(s) startsWith(s, "elmo_"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", elmo_model_file)) 34 | transformed_data <- ml_transform(model, test_data) 35 | expect_true("elmo" %in% colnames(transformed_data)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/testthat-finisher.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that 
will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("finisher param setting", { 26 | test_args <- list( 27 | input_cols = c("token"), 28 | output_cols = c("finisher_token"), 29 | clean_annotations = TRUE, 30 | value_split_symbol = "#", 31 | annotation_split_symbol = "@", 32 | include_metadata = TRUE, 33 | output_as_array = FALSE 34 | ) 35 | 36 | test_param_setting(sc, nlp_finisher, test_args) 37 | }) 38 | 39 | test_that("nlp_finisher spark_connection", { 40 | test_annotator <- nlp_finisher(sc, input_cols = c("token")) 41 | transformed_data <- ml_transform(test_annotator, test_data) 42 | expect_true("finished_token" %in% colnames(transformed_data)) 43 | }) 44 | 45 | test_that("nlp_finisher ml_pipeline", { 46 | test_annotator <- nlp_finisher(sc, input_cols = c("token")) 47 | transformed_data <- ml_transform(test_annotator, test_data) 48 | expect_true("finished_token" %in% colnames(transformed_data)) 49 | }) 50 | 51 | test_that("nlp_finisher tbl_spark", { 52 | transformed_data <- nlp_finisher(test_data, input_cols = c("token")) 53 | expect_true("finished_token" %in% colnames(transformed_data)) 54 | }) 55 | 56 | -------------------------------------------------------------------------------- /tests/testthat/testthat-language-detector-dl.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_language_detector pretrained", { 24 | model <- nlp_language_detector_dl_pretrained(sc, input_cols = c("document"), output_col = "language", threshold = 0.2) 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("language" %in% colnames(transformed_data)) 27 | 28 | # Test Float parameters 29 | oldvalue <- ml_param(model, "threshold") 30 | newmodel <- nlp_set_param(model, "threshold", 0.8) 31 | newvalue <- ml_param(newmodel, "threshold") 32 | 33 | expect_equal(newvalue, 0.8) 34 | }) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/testthat/testthat-light-pipeline.R: 
-------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | embeddings <- nlp_word_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings") 10 | 11 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, embeddings) 12 | fit_pipeline <- ml_fit(pipeline, text_tbl) 13 | 14 | assign("sc", sc, envir = parent.frame()) 15 | assign("fit_pipeline", fit_pipeline, envir = parent.frame()) 16 | assign("text_tbl", text_tbl, envir = parent.frame()) 17 | }) 18 | 19 | teardown({ 20 | spark_disconnect(sc) 21 | rm(sc, envir = .GlobalEnv) 22 | rm(fit_pipeline, envir = .GlobalEnv) 23 | rm(text_tbl, envir = .GlobalEnv) 24 | }) 25 | 26 | test_that("nlp_light_pipeline data frame annotate", { 27 | light_pipeline <- nlp_light_pipeline(fit_pipeline) 28 | result <- nlp_annotate(light_pipeline, text_tbl, "text") 29 | expect_true("embeddings" %in% colnames(result)) 30 | }) 31 | 32 | test_that("nlp_light_pipeline pre-trained", { 33 | pipeline <- nlp_pretrained_pipeline(sc, "explain_document_ml", lang = "en") 34 | light_pipeline <- nlp_light_pipeline(pipeline) 35 | result <- nlp_annotate(light_pipeline, "French author who helped pioneer the science-fiction genre. Verne wrote about space, air, and underwater travel before navigable aircraft and practical submarines were invented, and before any means of space travel had been devised.") 36 | expect_true("token" %in% names(result)) 37 | }) 38 | 39 | -------------------------------------------------------------------------------- /tests/testthat/testthat-longformer-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_longformer_embeddings pretrained", { 25 | model <- nlp_longformer_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("embeddings" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_longformer_embeddings load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s,
"longformer_base"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | transformed_data <- ml_transform(model, test_data) 35 | expect_true("embeddings" %in% colnames(transformed_data)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/testthat-longformer-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_longformer_token_classification pretrained", { 25 | model <- nlp_longformer_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_longformer_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "longformer_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-ngram-generator.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("nlp_ngram_generator param setting", { 26 | test_args <- list( 27 | input_cols = c("string1"), 28 | output_col = "string1", 29 | n = 2, 30 | enable_cumulative = TRUE, 31 | delimiter = "_" 32 | ) 33 | 34 | 
test_param_setting(sc, nlp_ngram_generator, test_args) 35 | }) 36 | 37 | test_that("nlp_ngram_generator spark_connection", { 38 | test_annotator <- nlp_ngram_generator(sc, input_cols = c("token"), output_col = "ngrams", n = 2) 39 | transformed_data <- ml_transform(test_annotator, test_data) 40 | expect_true("ngrams" %in% colnames(transformed_data)) 41 | }) 42 | 43 | test_that("nlp_ngram_generator ml_pipeline", { 44 | test_annotator <- nlp_ngram_generator(pipeline, input_cols = c("token"), output_col = "ngrams", n = 2, enable_cumulative = TRUE) 45 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 46 | expect_true("ngrams" %in% colnames(transformed_data)) 47 | }) 48 | 49 | test_that("nlp_ngram_generator tbl_spark", { 50 | transformed_data <- nlp_ngram_generator(test_data, input_cols = c("token"), output_col = "ngrams") 51 | expect_true("ngrams" %in% colnames(transformed_data)) 52 | }) 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/testthat-pretrained-pipeline.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | assign("sc", sc, envir = parent.frame()) 6 | assign("text_tbl", text_tbl, envir = parent.frame()) 7 | }) 8 | 9 | teardown({ 10 | spark_disconnect(sc) 11 | rm(sc, envir = .GlobalEnv) 12 | rm(text_tbl, envir = .GlobalEnv) 13 | }) 14 | 15 | test_that("nlp_pretrained_pipeline() tbl_spark", { 16 | result <- nlp_pretrained_pipeline(text_tbl, "recognize_entities_dl") 17 | expect_true("entities" %in% colnames(result)) 18 | }) 19 | 20 | test_that("nlp_pretrained_pipeline() spark_connection", { 21 | result <- nlp_pretrained_pipeline(sc, "recognize_entities_dl") 22 | expect_equal(jobj_class(spark_jobj(result)), c("PretrainedPipeline", "Object")) 23 | }) 24 | 25 | test_that("nlp_pretrained_pipeline annotate", { 26 | pipeline <- nlp_pretrained_pipeline(sc, "recognize_entities_dl") 27 | annotations <- nlp_annotate(pipeline, text_tbl, column = "text") 28 | expect_true("entities" %in% colnames(annotations)) 29 | }) 30 | 31 | test_that("as_pipeline_model().nlp_pretrained_pipeline", { 32 | pipeline <- nlp_pretrained_pipeline(sc, "recognize_entities_dl") 33 | pm <- as_pipeline_model(pipeline) 34 | expect_s3_class(pm, "ml_pipeline_model") 35 | }) 36 | -------------------------------------------------------------------------------- /tests/testthat/testthat-pubtator.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("nlp_pubtator_read_dataset", { 3 | sc <- testthat_spark_connection() 4 | pubtator <- nlp_pubtator_read_dataset(sc, here::here("tests", "testthat", "data", "corpus_pubtator_sample.txt")) 5 | expect_true("doc_id" %in% colnames(pubtator)) 6 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-recursive-pipeline.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | #pipeline <- ml_pipeline(assembler, sentdetect, tokenizer, embeddings) 6 | #fit_pipeline <- ml_fit(pipeline, text_tbl) 7 | 8 | assign("sc", sc, envir = parent.frame()) 9 | #assign("fit_pipeline", fit_pipeline, envir = parent.frame()) 10 | assign("text_tbl", text_tbl, envir = parent.frame()) 11 | }) 12 | 13 | teardown({ 14 | spark_disconnect(sc) 15 | rm(sc, envir = .GlobalEnv) 16 | 
#rm(fit_pipeline, envir = .GlobalEnv) 17 | rm(text_tbl, envir = .GlobalEnv) 18 | }) 19 | 20 | test_that("nlp_recursive_pipeline spark connection", { 21 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 22 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 23 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 24 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 25 | 26 | recursive_pipeline <- nlp_recursive_pipeline(sc) %>% 27 | ml_add_stage(assembler) %>% 28 | ml_add_stage(sentdetect) %>% 29 | ml_add_stage(tokenizer) 30 | 31 | result <- ml_fit_and_transform(recursive_pipeline, text_tbl) 32 | expect_true("token" %in% colnames(result)) 33 | }) 34 | 35 | test_that("nlp_recursive_pipeline pipeline stages", { 36 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 37 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 38 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 39 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 40 | 41 | recursive_pipeline <- nlp_recursive_pipeline(assembler, sentdetect, tokenizer) 42 | 43 | result <- ml_fit_and_transform(recursive_pipeline, text_tbl) 44 | expect_true("token" %in% colnames(result)) 45 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-roberta-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | 25 | test_that("nlp_roberta_embeddings pretrained", { 26 | model <- nlp_roberta_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings") 27 | transformed_data <- ml_transform(model, test_data) 28 | expect_true("embeddings" %in% colnames(transformed_data)) 29 | }) 30 | 31 | test_that("nlp_distilbert_embeddings load", { 32 | model_files <- list.files("~/cache_pretrained/") 33 | bert_model_file <- max(Filter(function(s) startsWith(s, "roberta_base"), model_files)) 34 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("embeddings" %in% colnames(transformed_data)) 37 | }) 38 | 39 | -------------------------------------------------------------------------------- /tests/testthat/testthat-roberta-for-token-classification.R: 
-------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_roberta_token_classification pretrained", { 25 | model <- nlp_roberta_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_roberta_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "roberta_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-roberta_sentence_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_roberta_sentence_embeddings pretrained", { 24 | model <- nlp_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "roberta_sentence_embeddings") 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("roberta_sentence_embeddings" %in% colnames(transformed_data)) 27 | 28 | expect_true(inherits(model, "nlp_roberta_sentence_embeddings")) 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/testthat-sentence-detector.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 5 | 
pipeline <- ml_pipeline(assembler) 6 | document_data <- ml_transform(assembler, text_tbl) 7 | 8 | assign("sc", sc, envir = parent.frame()) 9 | assign("pipeline", pipeline, envir = parent.frame()) 10 | assign("document_data", document_data, envir = parent.frame()) 11 | }) 12 | 13 | teardown({ 14 | spark_disconnect(sc) 15 | rm(sc, envir = .GlobalEnv) 16 | rm(pipeline, envir = .GlobalEnv) 17 | rm(document_data, envir = .GlobalEnv) 18 | }) 19 | 20 | test_that("nlp_sentence_detector() param setting", { 21 | test_args <- list( 22 | input_cols = c("document"), 23 | output_col = "sentence", 24 | custom_bounds = c(":"), 25 | use_custom_only = FALSE, 26 | use_abbreviations = TRUE, 27 | explode_sentences = FALSE, 28 | detect_lists = TRUE, 29 | min_length = 20, 30 | max_length = 400, 31 | split_length = 250 32 | ) 33 | test_param_setting(sc, nlp_sentence_detector, test_args) 34 | }) 35 | 36 | test_that("nlp_sentence_detector() spark_connection", { 37 | detector <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 38 | transformed_data <- ml_transform(detector, document_data) 39 | 40 | expect_true("sentence" %in% colnames(transformed_data)) 41 | }) 42 | 43 | test_that("nlp_sentence_detector() ml_pipeline", { 44 | detector <- nlp_sentence_detector(pipeline, input_cols = c("document"), output_col = "sentence") 45 | pipeline <- ml_pipeline(detector) 46 | 47 | transformed_data <- ml_fit_and_transform(pipeline, document_data) 48 | 49 | expect_true("sentence" %in% colnames(transformed_data)) 50 | }) 51 | 52 | test_that("nlp_sentence_detector() tbl_spark", { 53 | transformed_data <- nlp_sentence_detector(document_data, input_cols = c("document"), output_col = "sentence") 54 | expect_true("sentence" %in% colnames(transformed_data)) 55 | }) 56 | -------------------------------------------------------------------------------- /tests/testthat/testthat-stemmer.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("stemmer param setting", { 26 | test_args <- list( 27 | input_cols = c("string1"), 28 | output_col = "string1", 29 | language = "string1" 30 | ) 31 | 32 | test_param_setting(sc, nlp_stemmer, test_args) 33 | }) 34 | 35 | test_that("nlp_stemmer spark_connection", { 36 | test_annotator <- nlp_stemmer(sc, input_cols = c("token"), output_col = "stem") 37 | transformed_data <- ml_transform(test_annotator, test_data) 38 | expect_true("stem" %in% colnames(transformed_data)) 39 | }) 40 | 41 | test_that("nlp_stemmer ml_pipeline", { 42 | test_annotator <- nlp_stemmer(pipeline, input_cols = c("token"),
output_col = "stem") 43 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 44 | expect_true("stem" %in% colnames(transformed_data)) 45 | }) 46 | 47 | test_that("nlp_stemmer tbl_spark", { 48 | transformed_data <- nlp_stemmer(test_data, input_cols = c("token"), output_col = "stem") 49 | expect_true("stem" %in% colnames(transformed_data)) 50 | }) 51 | 52 | -------------------------------------------------------------------------------- /tests/testthat/testthat-token-assembler.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "normalized") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | spark_disconnect(sc) 20 | rm(sc, envir = .GlobalEnv) 21 | rm(pipeline, envir = .GlobalEnv) 22 | rm(test_data, envir = .GlobalEnv) 23 | }) 24 | 25 | test_that("token_assembler param setting", { 26 | # TODO: edit these to make them legal values for the parameters 27 | test_args <- list( 28 | input_cols = c("string1", "string2"), 29 | output_col = "string1" 30 | ) 31 | 32 | test_param_setting(sc, nlp_token_assembler, test_args) 33 | }) 34 | 35 | test_that("nlp_token_assembler spark_connection", { 36 | test_annotator <- nlp_token_assembler(sc, input_cols = c("document", "normalized"), output_col = "assembled") 37 | transformed_data <- ml_transform(test_annotator, test_data) 38 | expect_true("assembled" %in% colnames(transformed_data)) 39 | }) 40 | 41 | test_that("nlp_token_assembler ml_pipeline", { 42 | test_annotator <- nlp_token_assembler(pipeline, input_cols = c("document", "normalized"), output_col = "assembled") 43 | transformed_data <- ml_fit_and_transform(test_annotator, test_data) 44 | expect_true("assembled" %in% colnames(transformed_data)) 45 | }) 46 | 47 | test_that("nlp_token_assembler tbl_spark", { 48 | transformed_data <- nlp_token_assembler(test_data, input_cols = c("document", "normalized"), output_col = "assembled") 49 | expect_true("assembled" %in% colnames(transformed_data)) 50 | }) 51 | 52 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlm-roberta-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | 
assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_xlm_roberta_embeddings pretrained", { 25 | model <- nlp_xlm_roberta_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "embeddings", 26 | name = "xlm_roberta_base") 27 | transformed_data <- ml_transform(model, test_data) 28 | expect_true("embeddings" %in% colnames(transformed_data)) 29 | }) 30 | 31 | test_that("nlp_xlm_roberta_embeddings load", { 32 | model_files <- list.files("~/cache_pretrained/") 33 | bert_model_file <- max(Filter(function(s) startsWith(s, "xlm_roberta_base"), model_files)) 34 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("embeddings" %in% colnames(transformed_data)) 37 | }) 38 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlm_roberta-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_xlm_roberta_token_classification pretrained", { 25 | model <- nlp_xlm_roberta_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_xlm_roberta_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "xlm_roberta_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/testthat-xlm_roberta_sentence_embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | 8 | pipeline <- 
ml_pipeline(assembler) 9 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 10 | 11 | assign("sc", sc, envir = parent.frame()) 12 | assign("pipeline", pipeline, envir = parent.frame()) 13 | assign("test_data", test_data, envir = parent.frame()) 14 | }) 15 | 16 | teardown({ 17 | spark_disconnect(sc) 18 | rm(sc, envir = .GlobalEnv) 19 | rm(pipeline, envir = .GlobalEnv) 20 | rm(test_data, envir = .GlobalEnv) 21 | }) 22 | 23 | test_that("nlp_xlm_roberta_sentence_embeddings pretrained", { 24 | model <- nlp_xlm_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "xlm_roberta_sentence_embeddings") 25 | transformed_data <- ml_transform(model, test_data) 26 | expect_true("xlm_roberta_sentence_embeddings" %in% colnames(transformed_data)) 27 | 28 | expect_true(inherits(model, "nlp_xlm_roberta_sentence_embeddings")) 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlnet-embeddings.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | # config <- spark_config() 4 | # config$`sparklyr.shell.driver-memory` <- "8G" 5 | # sc <- spark_connect(master = "local", version = "2.4.3", config = config) 6 | text_tbl <- testthat_tbl("test_text") 7 | 8 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 9 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 10 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 11 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 12 | 13 | pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 14 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 15 | 16 | assign("sc", sc, envir = parent.frame()) 17 | assign("pipeline", pipeline, envir = parent.frame()) 18 | assign("test_data", test_data, envir = parent.frame()) 19 | }) 20 | 21 | teardown({ 22 | spark_disconnect(sc) 23 | rm(sc, envir = .GlobalEnv) 24 | rm(pipeline, envir = .GlobalEnv) 25 | rm(test_data, envir = .GlobalEnv) 26 | }) 27 | 28 | test_that("nlp_xlnet_embeddings pretrained", { 29 | model <- nlp_xlnet_embeddings_pretrained(sc, input_cols = c("sentence", "token"), output_col = "xlnet") 30 | transformed_data <- ml_transform(model, test_data) 31 | expect_true("xlnet" %in% colnames(transformed_data)) 32 | }) 33 | 34 | test_that("nlp_xlnet_embeddings load", { 35 | model_files <- list.files("~/cache_pretrained/") 36 | xlnet_model_file <- max(Filter(function(s) startsWith(s, "xlnet_base"), model_files)) 37 | model <- ml_load(sc, paste0("~/cache_pretrained/", xlnet_model_file)) 38 | transformed_data <- ml_transform(model, test_data) 39 | expect_true("xlnet" %in% colnames(transformed_data)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/testthat-xlnet-for-token-classification.R: -------------------------------------------------------------------------------- 1 | setup({ 2 | sc <- testthat_spark_connection() 3 | text_tbl <- testthat_tbl("test_text") 4 | 5 | # These lines should set a pipeline that will ultimately create the columns needed for testing the annotator 6 | assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document") 7 | sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence") 8 | tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token") 9 | 10 |
pipeline <- ml_pipeline(assembler, sentdetect, tokenizer) 11 | test_data <- ml_fit_and_transform(pipeline, text_tbl) 12 | 13 | assign("sc", sc, envir = parent.frame()) 14 | assign("pipeline", pipeline, envir = parent.frame()) 15 | assign("test_data", test_data, envir = parent.frame()) 16 | }) 17 | 18 | teardown({ 19 | rm(sc, envir = .GlobalEnv) 20 | rm(pipeline, envir = .GlobalEnv) 21 | rm(test_data, envir = .GlobalEnv) 22 | }) 23 | 24 | test_that("nlp_xlnet_token_classification pretrained", { 25 | model <- nlp_xlnet_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "bert") 26 | transformed_data <- ml_transform(model, test_data) 27 | expect_true("bert" %in% colnames(transformed_data)) 28 | }) 29 | 30 | test_that("nlp_xlnet_token_classification load", { 31 | model_files <- list.files("~/cache_pretrained/") 32 | bert_model_file <- max(Filter(function(s) startsWith(s, "xlnet_base_token"), model_files)) 33 | model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file)) 34 | model <- nlp_set_output_col(model, "label") 35 | transformed_data <- ml_transform(model, test_data) 36 | expect_true("label" %in% colnames(transformed_data)) 37 | }) -------------------------------------------------------------------------------- /tests/testthat/tf_graphs/RE_in1200D_out20.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/tf_graphs/RE_in1200D_out20.pb -------------------------------------------------------------------------------- /tests/testthat/tf_graphs/blstm_34_32_30_200_6.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/tf_graphs/blstm_34_32_30_200_6.pb -------------------------------------------------------------------------------- /tests/testthat/tf_graphs/blstm_5_200_128_67.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r-spark/sparknlp/4c2ad871cc7fec46f8574f9361c78b4bed39c924/tests/testthat/tf_graphs/blstm_5_200_128_67.pb --------------------------------------------------------------------------------
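Note on the test layout above: every annotator test file follows the same pattern — a setup() block builds a small document assembler → sentence detector → tokenizer pipeline and caches the Spark connection plus the transformed test data, a teardown() block removes those objects, and the annotator is then exercised through its spark_connection, ml_pipeline and tbl_spark dispatch forms, often alongside a test_param_setting() round-trip of its parameters. The sketch below is illustrative only: nlp_some_annotator and the "out" column are placeholders, and the helpers testthat_spark_connection(), testthat_tbl() and test_param_setting() are assumed to come from the suite's helper files, which are not reproduced in this listing.

library(testthat)
library(sparklyr)
library(sparknlp)

setup({
  sc <- testthat_spark_connection()       # assumed helper: shared local Spark connection
  text_tbl <- testthat_tbl("test_text")   # assumed helper: small table with a "text" column

  # Build the columns most annotators consume: document -> sentence -> token
  assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")
  sentdetect <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence")
  tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token")

  pipeline <- ml_pipeline(assembler, sentdetect, tokenizer)
  test_data <- ml_fit_and_transform(pipeline, text_tbl)

  assign("sc", sc, envir = parent.frame())
  assign("pipeline", pipeline, envir = parent.frame())
  assign("test_data", test_data, envir = parent.frame())
})

teardown({
  rm(sc, envir = .GlobalEnv)
  rm(pipeline, envir = .GlobalEnv)
  rm(test_data, envir = .GlobalEnv)
})

# nlp_some_annotator is a placeholder for whichever annotator is under test
test_that("nlp_some_annotator spark_connection", {
  annotator <- nlp_some_annotator(sc, input_cols = c("token"), output_col = "out")
  transformed <- ml_transform(annotator, test_data)
  expect_true("out" %in% colnames(transformed))
})

test_that("nlp_some_annotator ml_pipeline", {
  annotator <- nlp_some_annotator(pipeline, input_cols = c("token"), output_col = "out")
  transformed <- ml_fit_and_transform(annotator, test_data)
  expect_true("out" %in% colnames(transformed))
})

test_that("nlp_some_annotator tbl_spark", {
  transformed <- nlp_some_annotator(test_data, input_cols = c("token"), output_col = "out")
  expect_true("out" %in% colnames(transformed))
})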