├── LICENSE ├── tools ├── logo.png ├── logo.xcf ├── logo_black.xcf ├── logo_white.xcf └── fasttext-logo-color-web.png ├── tests ├── testthat.R └── testthat │ ├── test-prefix.R │ ├── test-unsupervised.R │ └── test-supervised.R ├── data ├── test_sentences.rda ├── train_sentences.rda └── stop_words_sentences.rda ├── inst ├── extdata │ ├── model_classification_test.bin │ └── model_unsupervised_test.bin └── include │ ├── fastrtext.h │ └── fastrtext_RcppExports.h ├── .gitignore ├── CRAN-RELEASE ├── cleanup ├── src ├── main.h ├── fasttext │ ├── real.h │ ├── matrix.cc │ ├── matrix.h │ ├── utils.cc │ ├── utils.h │ ├── vector.h │ ├── quantmatrix.h │ ├── productquantizer.h │ ├── meter.h │ ├── model.h │ ├── meter.cc │ ├── vector.cc │ ├── args.h │ ├── densematrix.h │ ├── model.cc │ ├── autotune.h │ ├── dictionary.h │ ├── quantmatrix.cc │ ├── loss.h │ ├── densematrix.cc │ ├── fasttext.h │ └── productquantizer.cc ├── r_compliance.cc ├── r_compliance.h ├── Makevars ├── add_prefix.cpp └── RcppExports.cpp ├── docs ├── pkgdown.yml ├── link.svg ├── docsearch.js ├── pkgdown.js ├── LICENSE-text.html ├── pkgdown.css ├── authors.html ├── news │ └── index.html ├── articles │ └── index.html └── reference │ ├── print_help.html │ ├── stop_words_sentences.html │ ├── Rcpp_fastrtext-class.html │ ├── load_model.html │ ├── add_prefix.html │ └── get_word_distance.html ├── .Rbuildignore ├── .travis.yml ├── man ├── print_help.Rd ├── stop_words_sentences.Rd ├── load_model.Rd ├── get_parameters.Rd ├── get_labels.Rd ├── get_dictionary.Rd ├── add_prefix.Rd ├── Rcpp_fastrtext-class.Rd ├── get_sentence_representation.Rd ├── get_word_distance.Rd ├── get_word_ids.Rd ├── get_tokenized_text.Rd ├── get_word_vectors.Rd ├── get_hamming_loss.Rd ├── get_nn.Rd ├── add_tags.Rd ├── fastrtext.Rd ├── predict.Rcpp_fastrtext.Rd ├── execute.Rd ├── test_sentences.Rd ├── train_sentences.Rd ├── build_vectors.Rd └── build_supervised.Rd ├── R ├── zzz.R ├── RcppExports.R └── data.R ├── NAMESPACE ├── _pkgdown.yml ├── appveyor.yml 
├── vignettes ├── unsupervised_learning.Rmd ├── supervised_learning.Rmd └── list_commands.Rmd ├── data-raw └── create_models.R ├── DESCRIPTION ├── README.md ├── NEWS.md ├── cran-comments.md └── index.md /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017 2 | COPYRIGHT HOLDER: Michaël Benesty -------------------------------------------------------------------------------- /tools/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo.png -------------------------------------------------------------------------------- /tools/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo.xcf -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(fastrtext) 3 | 4 | test_check("fastrtext") 5 | -------------------------------------------------------------------------------- /tools/logo_black.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo_black.xcf -------------------------------------------------------------------------------- /tools/logo_white.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo_white.xcf -------------------------------------------------------------------------------- /data/test_sentences.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/data/test_sentences.rda 
-------------------------------------------------------------------------------- /data/train_sentences.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/data/train_sentences.rda -------------------------------------------------------------------------------- /data/stop_words_sentences.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/data/stop_words_sentences.rda -------------------------------------------------------------------------------- /tools/fasttext-logo-color-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/fasttext-logo-color-web.png -------------------------------------------------------------------------------- /inst/extdata/model_classification_test.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/inst/extdata/model_classification_test.bin -------------------------------------------------------------------------------- /inst/extdata/model_unsupervised_test.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/inst/extdata/model_unsupervised_test.bin -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rproj 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | build/* 6 | *.o 7 | *.so 8 | *.dll 9 | data-raw/*.bin 10 | data-raw/*.vec 11 | -------------------------------------------------------------------------------- /CRAN-RELEASE: -------------------------------------------------------------------------------- 1 | 
This package was submitted to CRAN on 2019-10-27. 2 | Once it is accepted, delete this file and tag the release (commit 7c1c7cdf4a). 3 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -Rf src/*.o src/fasttext/*.o src/*.so src/fasttext/*.so src/*.dll src/fasttext/*.dll src/*.dylib src/fasttext/*.dylib src/symbols.rds 4 | -------------------------------------------------------------------------------- /src/main.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "fasttext/fasttext.h" 6 | #include "fasttext/args.h" 7 | 8 | int main(int argc, char** argv); 9 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.3.1 2 | pkgdown: 1.3.0 3 | pkgdown_sha: ~ 4 | articles: 5 | list_commands: list_commands.html 6 | supervised_learning: supervised_learning.html 7 | unsupervised_learning: unsupervised_learning.html 8 | 9 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^\.travis\.yml$ 5 | ^appveyor\.yml$ 6 | ^/tools/*\.png$ 7 | ^docs$ 8 | ^README\.md$ 9 | ^cran-comments\.md$ 10 | ^.*.\.o$ 11 | ^_pkgdown\.yml$ 12 | ^data-raw$ 13 | index.md 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | r_github_packages: 8 | - r-lib/covr 9 | 10 | after_success: 11 
| - travis_wait 180 Rscript -e 'covr::codecov()' 12 | -------------------------------------------------------------------------------- /inst/include/fastrtext.h: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #ifndef RCPP_fastrtext_H_GEN_ 5 | #define RCPP_fastrtext_H_GEN_ 6 | 7 | #include "fastrtext_RcppExports.h" 8 | 9 | #endif // RCPP_fastrtext_H_GEN_ 10 | -------------------------------------------------------------------------------- /src/fasttext/real.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | namespace fasttext { 12 | 13 | typedef float real; 14 | 15 | } 16 | -------------------------------------------------------------------------------- /man/print_help.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{print_help} 4 | \alias{print_help} 5 | \title{Print help} 6 | \usage{ 7 | print_help() 8 | } 9 | \description{ 10 | Print command information, mainly to use with \code{\link[=execute]{execute()}} \code{function}. 
11 | } 12 | \examples{ 13 | \dontrun{ 14 | print_help() 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /man/stop_words_sentences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{stop_words_sentences} 5 | \alias{stop_words_sentences} 6 | \title{Stop words list} 7 | \format{Character vector of stop words} 8 | \source{ 9 | \url{https://archive.ics.uci.edu/ml/index.php} 10 | } 11 | \usage{ 12 | stop_words_sentences 13 | } 14 | \description{ 15 | List of words that can be safely removed from sentences. 16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /tests/testthat/test-prefix.R: -------------------------------------------------------------------------------- 1 | context("test word prefix") 2 | 3 | test_that("test unique prefix", { 4 | expect_equal(object = add_prefix(c("this is a test", "this is another test"), "#"), 5 | expected = c("#this #is #a #test", 6 | "#this #is #another #test")) 7 | }) 8 | 9 | test_that("test multiple prefixes", { 10 | expect_equal(object = add_prefix(c("this is a test", "this is another test"), c("#", "*")), 11 | expected = c("#this #is #a #test", 12 | "*this *is *another *test")) 13 | }) 14 | -------------------------------------------------------------------------------- /src/fasttext/matrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "matrix.h" 10 | 11 | namespace fasttext { 12 | 13 | Matrix::Matrix() : m_(0), n_(0) {} 14 | 15 | Matrix::Matrix(int64_t m, int64_t n) : m_(m), n_(n) {} 16 | 17 | int64_t Matrix::size(int64_t dim) const { 18 | assert(dim == 0 || dim == 1); 19 | if (dim == 0) { 20 | return m_; 21 | } 22 | return n_; 23 | } 24 | 25 | } // namespace fasttext 26 | -------------------------------------------------------------------------------- /man/load_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{load_model} 4 | \alias{load_model} 5 | \title{Load an existing fastText trained model} 6 | \usage{ 7 | load_model(path) 8 | } 9 | \arguments{ 10 | \item{path}{path to the existing model} 11 | } 12 | \description{ 13 | Load and return a pointer to an existing model which will be used in other functions of this package. 14 | } 15 | \examples{ 16 | 17 | library(fastrtext) 18 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 19 | model <- load_model(model_test_path) 20 | } 21 | -------------------------------------------------------------------------------- /src/r_compliance.cc: -------------------------------------------------------------------------------- 1 | // Content of this file is added to each source of fastText to change some behaviours 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "real.h" 7 | 8 | void exit_fasttext(int status_code) { 9 | if (status_code != EXIT_SUCCESS) { 10 | Rcpp::stop("Failure in fastrtext. 
Exit code: " + std::to_string(status_code)); 11 | } 12 | } 13 | 14 | // catch interrupt from the user 15 | // void interrupt_or_print(double maxDuration) { 16 | // Rcpp::checkUserInterrupt(); 17 | // printInfo(maxDuration); 18 | // } 19 | 20 | namespace std { 21 | std::ostream Rcout(Rcpp::Rcout.rdbuf()); 22 | } 23 | -------------------------------------------------------------------------------- /man/get_parameters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_parameters} 4 | \alias{get_parameters} 5 | \title{Export hyper parameters} 6 | \usage{ 7 | get_parameters(model) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | } 12 | \value{ 13 | \link{list} containing each parameter 14 | } 15 | \description{ 16 | Retrieve hyper parameters used to train the model 17 | } 18 | \examples{ 19 | 20 | library(fastrtext) 21 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 22 | model <- load_model(model_test_path) 23 | print(head(get_parameters(model), 5)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/get_labels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_labels} 4 | \alias{get_labels} 5 | \title{Get list of labels (supervised model)} 6 | \usage{ 7 | get_labels(model) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | } 12 | \value{ 13 | \link{character} containing each label 14 | } 15 | \description{ 16 | Get a \link{character} containing each label seen during training. 
17 | } 18 | \examples{ 19 | 20 | library(fastrtext) 21 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 22 | model <- load_model(model_test_path) 23 | print(head(get_labels(model), 5)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/get_dictionary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_dictionary} 4 | \alias{get_dictionary} 5 | \title{Get list of known words} 6 | \usage{ 7 | get_dictionary(model) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | } 12 | \value{ 13 | \link{character} containing each word 14 | } 15 | \description{ 16 | Get a \link{character} containing each word seen during training. 17 | } 18 | \examples{ 19 | 20 | library(fastrtext) 21 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 22 | model <- load_model(model_test_path) 23 | print(head(get_dictionary(model), 5)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/add_prefix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{add_prefix} 4 | \alias{add_prefix} 5 | \title{Add a prefix to each word} 6 | \usage{ 7 | add_prefix(texts, prefix) 8 | } 9 | \arguments{ 10 | \item{texts}{a \link{character} containing the original text} 11 | 12 | \item{prefix}{unit \link{character} containing the prefix to add (length == 1) or \link{character} with same length than texts} 13 | } 14 | \value{ 15 | \link{character} with prefixed words. 16 | } 17 | \description{ 18 | Add a custom prefix to each word of a a line to create different spaces. 19 | Code in C++ (efficient). 
20 | } 21 | \examples{ 22 | add_prefix(c("this is a test", "this is another test"), "#") 23 | } 24 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | loadModule("FASTRTEXT_MODULE", TRUE) 2 | 3 | #' @name fastrtext 4 | #' @useDynLib fastrtext, .registration = TRUE 5 | #' @importFrom Rcpp evalCpp loadModule cpp_object_initializer 6 | #' @import methods 7 | "_PACKAGE" 8 | 9 | #' Rcpp_fastrtext class 10 | #' 11 | #' Models are [S4] objects with several slots (methods) which can be called that way: model$slot_name() 12 | #' 13 | #' @name Rcpp_fastrtext-class 14 | #' 15 | #' @slot load Load a model 16 | #' @slot predict Make a prediction 17 | #' @slot execute Execute commands 18 | #' @slot get_vectors Get vectors related to provided words 19 | #' @slot get_parameters Get parameters used to train the model 20 | #' @slot get_dictionary List all words learned 21 | #' @slot get_labels List all labels learned 22 | NULL 23 | -------------------------------------------------------------------------------- /man/Rcpp_fastrtext-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \name{Rcpp_fastrtext-class} 4 | \alias{Rcpp_fastrtext-class} 5 | \title{Rcpp_fastrtext class} 6 | \description{ 7 | Models are \link{S4} objects with several slots (methods) which can be called that way: model$slot_name() 8 | } 9 | \section{Slots}{ 10 | 11 | \describe{ 12 | \item{\code{load}}{Load a model} 13 | 14 | \item{\code{predict}}{Make a prediction} 15 | 16 | \item{\code{execute}}{Execute commands} 17 | 18 | \item{\code{get_vectors}}{Get vectors related to provided words} 19 | 20 | \item{\code{get_parameters}}{Get parameters used to train the model} 21 | 22 | \item{\code{get_dictionary}}{List all words learned} 23 | 24 | 
\item{\code{get_labels}}{List all labels learned} 25 | }} 26 | 27 | -------------------------------------------------------------------------------- /man/get_sentence_representation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_sentence_representation} 4 | \alias{get_sentence_representation} 5 | \title{Get sentence embedding} 6 | \usage{ 7 | get_sentence_representation(model, sentences) 8 | } 9 | \arguments{ 10 | \item{model}{\code{fastText} model} 11 | 12 | \item{sentences}{\link{character} containing the sentences} 13 | } 14 | \description{ 15 | Sentence is splitted in words (using space characters), and word embeddings are averaged. 16 | } 17 | \examples{ 18 | library(fastrtext) 19 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 20 | model <- load_model(model_test_path) 21 | m <- get_sentence_representation(model, "this is a test") 22 | print(m) 23 | } 24 | -------------------------------------------------------------------------------- /man/get_word_distance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_word_distance} 4 | \alias{get_word_distance} 5 | \title{Distance between two words} 6 | \usage{ 7 | get_word_distance(model, w1, w2) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model. 
Null if train a new model.} 11 | 12 | \item{w1}{first word to compare} 13 | 14 | \item{w2}{second word to compare} 15 | } 16 | \value{ 17 | a \code{scalar} with the distance 18 | } 19 | \description{ 20 | Distance is equal to \code{1 - cosine} 21 | } 22 | \examples{ 23 | 24 | library(fastrtext) 25 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 26 | model <- load_model(model_test_path) 27 | get_word_distance(model, "time", "timing") 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/r_compliance.h: -------------------------------------------------------------------------------- 1 | // Content of this file is added to each source of fastText to change some behaviours 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define exit(status_code) exit_fasttext(status_code) 10 | #define cerr Rcout // with cerr, no line refresh possible on R (it is an issue for learning with verbose set to 2, progress line is updated) 11 | #define cout Rcout 12 | #define main main_fastrtext // no direct call to main(), otherwise Cran complains + strange errors 13 | 14 | 15 | // catch the call to exit and call Rcpp::stop() when there is a fail 16 | void exit_fasttext(int error_code); 17 | 18 | namespace std { 19 | // Copy of Rcout in std namespace to reroute cout to R terminal with a macro 20 | extern std::ostream Rcout; 21 | } 22 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /man/get_word_ids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_word_ids} 4 | \alias{get_word_ids} 5 | 
\title{Retrieve word IDs} 6 | \usage{ 7 | get_word_ids(model, words) 8 | } 9 | \arguments{ 10 | \item{model}{\code{fastText} model} 11 | 12 | \item{words}{\link{character} containing words to retrieve IDs} 13 | } 14 | \value{ 15 | \link{numeric} of ids 16 | } 17 | \description{ 18 | Get ID of words in the dictionary 19 | } 20 | \examples{ 21 | library(fastrtext) 22 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 23 | model <- load_model(model_test_path) 24 | ids <- get_word_ids(model, c("this", "is", "a", "test")) 25 | 26 | # print positions 27 | print(ids) 28 | # retrieve words in the dictionary using the positions retrieved 29 | print(get_dictionary(model)[ids]) 30 | } 31 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,Rcpp_fastrtext) 4 | export(add_prefix) 5 | export(add_tags) 6 | export(build_supervised) 7 | export(build_vectors) 8 | export(execute) 9 | export(get_dictionary) 10 | export(get_hamming_loss) 11 | export(get_labels) 12 | export(get_nn) 13 | export(get_parameters) 14 | export(get_sentence_representation) 15 | export(get_tokenized_text) 16 | export(get_word_distance) 17 | export(get_word_ids) 18 | export(get_word_vectors) 19 | export(load_model) 20 | export(print_help) 21 | import(methods) 22 | importFrom(Rcpp,cpp_object_initializer) 23 | importFrom(Rcpp,evalCpp) 24 | importFrom(Rcpp,loadModule) 25 | importFrom(assertthat,assert_that) 26 | importFrom(assertthat,is.count) 27 | importFrom(assertthat,is.flag) 28 | importFrom(assertthat,is.number) 29 | importFrom(assertthat,is.string) 30 | useDynLib(fastrtext, .registration = TRUE) 31 | -------------------------------------------------------------------------------- /man/get_tokenized_text.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_tokenized_text} 4 | \alias{get_tokenized_text} 5 | \title{Tokenize text} 6 | \usage{ 7 | get_tokenized_text(model, texts) 8 | } 9 | \arguments{ 10 | \item{model}{\code{fastText} model} 11 | 12 | \item{texts}{a \link{character} containing the documents} 13 | } 14 | \value{ 15 | a \link{list} of \link{character} containing words 16 | } 17 | \description{ 18 | Separate words in a text using space characters 19 | } 20 | \examples{ 21 | library(fastrtext) 22 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 23 | model <- load_model(model_test_path) 24 | tokens <- get_tokenized_text(model, "this is a test") 25 | print(tokens) 26 | tokens <- get_tokenized_text(model, c("this is a test 1", "this is a second test!")) 27 | print(tokens) 28 | } 29 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | reference: 2 | - title: "General" 3 | desc: "Functions useful in both supervised and unsupervised contexts." 4 | contents: 5 | - fastrtext 6 | - Rcpp_fastrtext-class 7 | - load_model 8 | - execute 9 | - get_parameters 10 | - print_help 11 | - title: "Supervised learning" 12 | desc: "Function useful for text classification." 13 | contents: 14 | - predict.Rcpp_fastrtext 15 | - get_hamming_loss 16 | - get_labels 17 | - title: "Unsupervised learning" 18 | desc: "Functions useful to play with word representations." 19 | contents: 20 | - get_word_vectors 21 | - get_word_distance 22 | - get_nn 23 | - get_dictionary 24 | - title: data 25 | desc: "Data embedded in the package for help and tests." 
26 | contents: 27 | - train_sentences 28 | - test_sentences 29 | - stop_words_sentences 30 | 31 | 32 | -------------------------------------------------------------------------------- /man/get_word_vectors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_word_vectors} 4 | \alias{get_word_vectors} 5 | \title{Get word embeddings} 6 | \usage{ 7 | get_word_vectors(model, words = get_dictionary(model)) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | 12 | \item{words}{\link{character} of words. Default: return every word from the dictionary.} 13 | } 14 | \value{ 15 | \link{matrix} containing each word embedding as a row and \code{rownames} are populated with word strings. 16 | } 17 | \description{ 18 | Return the vector representation of provided words (unsupervised training) 19 | or provided labels (supervised training). 20 | } 21 | \examples{ 22 | 23 | library(fastrtext) 24 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 25 | model <- load_model(model_test_path) 26 | get_word_vectors(model, c("introduction", "we")) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/get_hamming_loss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_hamming_loss} 4 | \alias{get_hamming_loss} 5 | \title{Hamming loss} 6 | \usage{ 7 | get_hamming_loss(labels, predictions) 8 | } 9 | \arguments{ 10 | \item{labels}{list of labels} 11 | 12 | \item{predictions}{list returned by the predict command (including both the probability and the categories)} 13 | } 14 | \value{ 15 | a \code{scalar} with the loss 16 | } 17 | \description{ 18 | Compute the hamming loss. 
When there is only one category, this measure the accuracy. 19 | } 20 | \examples{ 21 | 22 | library(fastrtext) 23 | data("test_sentences") 24 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 25 | model <- load_model(model_test_path) 26 | sentences <- test_sentences[, "text"] 27 | test_labels <- test_sentences[, "class.text"] 28 | predictions <- predict(model, sentences) 29 | get_hamming_loss(as.list(test_labels), predictions) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/get_nn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_nn} 4 | \alias{get_nn} 5 | \title{Get nearest neighbour vectors} 6 | \usage{ 7 | get_nn(model, word, k) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model. Null if train a new model.} 11 | 12 | \item{word}{reference word} 13 | 14 | \item{k}{\link{integer} defining the number of results to return} 15 | } 16 | \value{ 17 | \link{numeric} with distances with \link{names} as words 18 | } 19 | \description{ 20 | Find the \code{k} words with the smallest distance. 21 | First execution can be slow because of precomputation. 22 | Search is done linearly, if your model is big you may want to use an approximate neighbour algorithm from other R packages (like RcppAnnoy). 
23 | } 24 | \examples{ 25 | 26 | library(fastrtext) 27 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 28 | model <- load_model(model_test_path) 29 | get_nn(model, "time", 10) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' Add a prefix to each word 5 | #' 6 | #' Add a custom prefix to each word of a a line to create different spaces. 7 | #' Code in C++ (efficient). 8 | #' 9 | #' @param texts a [character] containing the original text 10 | #' @param prefix unit [character] containing the prefix to add (length == 1) or [character] with same length than texts 11 | #' @return [character] with prefixed words. 12 | #' @examples 13 | #' add_prefix(c("this is a test", "this is another test"), "#") 14 | #' @export 15 | add_prefix <- function(texts, prefix) { 16 | .Call(`_fastrtext_add_prefix`, texts, prefix) 17 | } 18 | 19 | add_pr <- function(line, prefix) { 20 | .Call(`_fastrtext_add_pr`, line, prefix) 21 | } 22 | 23 | # Register entry points for exported C++ functions 24 | methods::setLoadAction(function(ns) { 25 | .Call('_fastrtext_RcppExport_registerCCallable', PACKAGE = 'fastrtext') 26 | }) 27 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 
| 13 | cache: 14 | - C:\RLibrary 15 | 16 | # Adapt as necessary starting from here 17 | 18 | build_script: 19 | - travis-tool.sh install_deps 20 | 21 | test_script: 22 | - travis-tool.sh run_tests 23 | 24 | on_failure: 25 | - 7z a failure.zip *.Rcheck\* 26 | - appveyor PushArtifact failure.zip 27 | 28 | artifacts: 29 | - path: '*.Rcheck\**\*.log' 30 | name: Logs 31 | 32 | - path: '*.Rcheck\**\*.out' 33 | name: Logs 34 | 35 | - path: '*.Rcheck\**\*.fail' 36 | name: Logs 37 | 38 | - path: '*.Rcheck\**\*.Rout' 39 | name: Logs 40 | 41 | - path: '\*_*.tar.gz' 42 | name: Bits 43 | 44 | - path: '\*_*.zip' 45 | name: Bits 46 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKGROOT = ./fasttext 2 | 3 | CXX_STD = CXX11 4 | 5 | # include adds a header to each file, ugly hack to block call to exit() and replace cerr by cout 6 | PKG_CPPFLAGS = -pthread -include r_compliance.h -I$(PKGROOT) 7 | 8 | # pthread is used for multithreading by fastText 9 | PKG_LIBS = -pthread 10 | 11 | OBJECTS = add_prefix.o r_compliance.o $(PKGROOT)/autotune.o $(PKGROOT)/args.o $(PKGROOT)/matrix.o $(PKGROOT)/dictionary.o $(PKGROOT)/loss.o $(PKGROOT)/productquantizer.o $(PKGROOT)/densematrix.o $(PKGROOT)/quantmatrix.o $(PKGROOT)/vector.o $(PKGROOT)/model.o $(PKGROOT)/utils.o $(PKGROOT)/meter.o $(PKGROOT)/fasttext.o $(PKGROOT)/main.o fastrtext.o RcppExports.o 12 | 13 | # Reduce the size of the compiled library by removing unneeded debug information 14 | # Need to check if we are on Linux and if strip is installed 15 | # http://dirk.eddelbuettel.com/blog/2017/08/14/#009_compact_shared_libraries 16 | # strippedLib: $(SHLIB) 17 | # if test -e "/usr/bin/strip" && test -e "/bin/uname" && [[ `uname` == "Linux" ]] ; then /usr/bin/strip --strip-unneeded -K R_registerRoutines -K R_useDynamicSymbols $(SHLIB); fi 18 | 19 | # .phony: strippedLib 20 | 
-------------------------------------------------------------------------------- /vignettes/unsupervised_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Unsupervised learning" 3 | author: "M. Benesty" 4 | output: rmarkdown::html_vignette 5 | date: "`r Sys.Date()`" 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{Unsupervised learning} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r unsupervised_learning} 13 | library(fastrtext) 14 | 15 | data("train_sentences") 16 | data("test_sentences") 17 | texts <- tolower(train_sentences[,"text"]) 18 | tmp_file_txt <- tempfile() 19 | tmp_file_model <- tempfile() 20 | writeLines(text = texts, con = tmp_file_txt) 21 | execute(commands = c("skipgram", "-input", tmp_file_txt, "-output", tmp_file_model, "-verbose", 1)) 22 | 23 | model <- load_model(tmp_file_model) 24 | 25 | # test word extraction 26 | dict <- get_dictionary(model) 27 | print(head(dict, 5)) 28 | 29 | # print vector 30 | print(get_word_vectors(model, c("time", "timing"))) 31 | 32 | # test word distance 33 | get_word_distance(model, "time", "timing") 34 | 35 | # free memory 36 | unlink(tmp_file_txt) 37 | unlink(tmp_file_model) 38 | rm(model) 39 | gc() 40 | ``` 41 | -------------------------------------------------------------------------------- /src/fasttext/matrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include "real.h" 18 | 19 | namespace fasttext { 20 | 21 | class Vector; 22 | 23 | class Matrix { 24 | protected: 25 | int64_t m_; 26 | int64_t n_; 27 | 28 | public: 29 | Matrix(); 30 | explicit Matrix(int64_t, int64_t); 31 | virtual ~Matrix() = default; 32 | 33 | int64_t size(int64_t dim) const; 34 | 35 | virtual real dotRow(const Vector&, int64_t) const = 0; 36 | virtual void addVectorToRow(const Vector&, int64_t, real) = 0; 37 | virtual void addRowToVector(Vector& x, int32_t i) const = 0; 38 | virtual void addRowToVector(Vector& x, int32_t i, real a) const = 0; 39 | virtual void save(std::ostream&) const = 0; 40 | virtual void load(std::istream&) = 0; 41 | virtual void dump(std::ostream&) const = 0; 42 | }; 43 | 44 | } // namespace fasttext 45 | -------------------------------------------------------------------------------- /man/add_tags.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{add_tags} 4 | \alias{add_tags} 5 | \title{Add tags to documents} 6 | \usage{ 7 | add_tags(documents, tags, prefix = "__label__", new_lines = " ") 8 | } 9 | \arguments{ 10 | \item{documents}{texts to learn} 11 | 12 | \item{tags}{labels provided as a \link{list} or a \link{vector}. There can be 1 or more per document.} 13 | 14 | \item{prefix}{\link{character} to add in front of tag (\code{fastText} format)} 15 | 16 | \item{new_lines}{Character that replaces new lines (\code{\\r\\n}), default is space.} 17 | } 18 | \value{ 19 | \link{character} ready to be written in a file 20 | } 21 | \description{ 22 | Add tags in the \code{fastText} format. 23 | This format is required for the training step. 
As fastText doesn't support newlines inside documents 24 | (as newlines are delimiting documents) this function also ensures that there are absolutely no 25 | new lines. By default new lines are replaced by a single space. 26 | } 27 | \examples{ 28 | library(fastrtext) 29 | tags <- list(c(1, 5), 0) 30 | documents <- c("this is a text", "this is another document") 31 | add_tags(documents = documents, tags = tags) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/fastrtext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \docType{package} 4 | \name{fastrtext} 5 | \alias{fastrtext} 6 | \alias{fastrtext-package} 7 | \title{fastrtext: 'fastText' Wrapper for Text Classification and Word Representation} 8 | \description{ 9 | Learning text representations and text classifiers may rely 10 | on the same simple and efficient approach. 'fastText' is an open-source, free, 11 | lightweight library that allows users to perform both tasks. 12 | It transforms text into continuous vectors that can later 13 | be used on many language related task. 14 | It works on standard, generic hardware (no 'GPU' required). 15 | It also includes model size reduction feature. 16 | 'fastText' original source code is available 17 | at . 
18 | } 19 | \seealso{ 20 | Useful links: 21 | \itemize{ 22 | \item \url{https://github.com/pommedeterresautee/fastrtext} 23 | \item \url{https://pommedeterresautee.github.io/fastrtext/} 24 | \item Report bugs at \url{https://github.com/pommedeterresautee/fastrtext/issues} 25 | } 26 | 27 | } 28 | \author{ 29 | \strong{Maintainer}: Michaël Benesty \email{michael@benesty.fr} [copyright holder] 30 | 31 | Other contributors: 32 | \itemize{ 33 | \item Facebook, Inc \email{bojanowski@fb.com} [copyright holder] 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/fasttext/utils.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "utils.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace fasttext { 15 | 16 | namespace utils { 17 | 18 | int64_t size(std::ifstream& ifs) { 19 | ifs.seekg(std::streamoff(0), std::ios::end); 20 | return ifs.tellg(); 21 | } 22 | 23 | void seek(std::ifstream& ifs, int64_t pos) { 24 | ifs.clear(); 25 | ifs.seekg(std::streampos(pos)); 26 | } 27 | 28 | double getDuration( 29 | const std::chrono::steady_clock::time_point& start, 30 | const std::chrono::steady_clock::time_point& end) { 31 | return std::chrono::duration_cast>(end - start) 32 | .count(); 33 | } 34 | 35 | ClockPrint::ClockPrint(int32_t duration) : duration_(duration) {} 36 | 37 | std::ostream& operator<<(std::ostream& out, const ClockPrint& me) { 38 | int32_t etah = me.duration_ / 3600; 39 | int32_t etam = (me.duration_ % 3600) / 60; 40 | int32_t etas = (me.duration_ % 3600) % 60; 41 | 42 | out << std::setw(3) << etah << "h" << std::setw(2) << etam << "m"; 43 | out << std::setw(2) << etas << "s"; 44 | return out; 45 | } 46 | 47 | } // namespace utils 48 | 
49 | } // namespace fasttext 50 | -------------------------------------------------------------------------------- /data-raw/create_models.R: -------------------------------------------------------------------------------- 1 | # The purpose of this script is to create models 2 | # used in tests. 3 | 4 | require(fastrtext) 5 | 6 | data("train_sentences") 7 | data("test_sentences") 8 | 9 | # Unsupervised 10 | texts <- tolower(train_sentences[, "text"]) 11 | tmp_file_txt <- tempfile() 12 | tmp_file_model <- "./data-raw/model_unsupervised_test" 13 | writeLines(text = texts, con = tmp_file_txt) 14 | execute(commands = c("skipgram", 15 | "-input", tmp_file_txt, 16 | "-output", tmp_file_model, 17 | "-dim", 70, 18 | "-bucket", 1e3, 19 | "-epoch", 20)) 20 | 21 | # Supervised 22 | train_labels <- paste0("__label__", train_sentences[, "class.text"]) 23 | train_texts <- tolower(train_sentences[, "text"]) 24 | train_to_write <- paste(train_labels, train_texts) 25 | train_tmp_file_txt <- tempfile() 26 | tmp_file_model <- "./data-raw/model_classification_test" 27 | writeLines(text = train_to_write, con = train_tmp_file_txt) 28 | 29 | test_labels <- paste0("__label__", test_sentences[, "class.text"]) 30 | test_texts <- tolower(test_sentences[, "text"]) 31 | test_to_write <- paste(test_labels, test_texts) 32 | 33 | # learn model 34 | execute(commands = 35 | c("supervised", 36 | "-input", train_tmp_file_txt, 37 | "-output", tmp_file_model, 38 | "-dim", 20, 39 | "-lr", 1, 40 | "-epoch", 20, 41 | "-wordNgrams", 2, 42 | "-bucket", 1e3)) -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: fastrtext 2 | Type: Package 3 | Title: 'fastText' Wrapper for Text Classification and Word Representation 4 | Version: 0.3.4 5 | Date: 2019-10-27 6 | Authors@R: c(person("Michaël", "Benesty", role = c("aut", "cre", "cph"), email = "michael@benesty.fr"), 7 | 
person("Facebook, Inc", role = c("cph"), email = "bojanowski@fb.com")) 8 | Maintainer: Michaël Benesty 9 | Description: Learning text representations and text classifiers may rely 10 | on the same simple and efficient approach. 'fastText' is an open-source, free, 11 | lightweight library that allows users to perform both tasks. 12 | It transforms text into continuous vectors that can later 13 | be used on many language related task. 14 | It works on standard, generic hardware (no 'GPU' required). 15 | It also includes model size reduction feature. 16 | 'fastText' original source code is available 17 | at . 18 | URL: https://github.com/pommedeterresautee/fastrtext, https://pommedeterresautee.github.io/fastrtext/ 19 | BugReports: https://github.com/pommedeterresautee/fastrtext/issues 20 | License: MIT + file LICENSE 21 | Depends: R (>= 3.3) 22 | Imports: methods, 23 | Rcpp (>= 0.12.12), 24 | assertthat 25 | Suggests: knitr, 26 | testthat 27 | LinkingTo: Rcpp 28 | LazyData: true 29 | VignetteBuilder: knitr 30 | Roxygen: list(markdown = TRUE, roclets = c("rd", "collate", "namespace_roclet")) 31 | RoxygenNote: 6.1.1 32 | Encoding: UTF-8 33 | NeedsCompilation: yes 34 | -------------------------------------------------------------------------------- /src/fasttext/utils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include "real.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #if defined(__clang__) || defined(__GNUC__) 20 | #define FASTTEXT_DEPRECATED(msg) __attribute__((__deprecated__(msg))) 21 | #elif defined(_MSC_VER) 22 | #define FASTTEXT_DEPRECATED(msg) __declspec(deprecated(msg)) 23 | #else 24 | #define FASTTEXT_DEPRECATED(msg) 25 | #endif 26 | 27 | namespace fasttext { 28 | 29 | using Predictions = std::vector>; 30 | 31 | namespace utils { 32 | 33 | int64_t size(std::ifstream&); 34 | 35 | void seek(std::ifstream&, int64_t); 36 | 37 | template 38 | bool contains(const std::vector& container, const T& value) { 39 | return std::find(container.begin(), container.end(), value) != 40 | container.end(); 41 | } 42 | 43 | double getDuration( 44 | const std::chrono::steady_clock::time_point& start, 45 | const std::chrono::steady_clock::time_point& end); 46 | 47 | class ClockPrint { 48 | public: 49 | explicit ClockPrint(int32_t duration); 50 | friend std::ostream& operator<<(std::ostream& out, const ClockPrint& me); 51 | 52 | private: 53 | int32_t duration_; 54 | }; 55 | 56 | } // namespace utils 57 | 58 | } // namespace fasttext 59 | -------------------------------------------------------------------------------- /src/fasttext/vector.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "real.h" 16 | 17 | namespace fasttext { 18 | 19 | class Matrix; 20 | 21 | class Vector { 22 | protected: 23 | std::vector data_; 24 | 25 | public: 26 | explicit Vector(int64_t); 27 | Vector(const Vector&) = default; 28 | Vector(Vector&&) noexcept = default; 29 | Vector& operator=(const Vector&) = default; 30 | Vector& operator=(Vector&&) = default; 31 | 32 | inline real* data() { 33 | return data_.data(); 34 | } 35 | inline const real* data() const { 36 | return data_.data(); 37 | } 38 | inline real& operator[](int64_t i) { 39 | return data_[i]; 40 | } 41 | inline const real& operator[](int64_t i) const { 42 | return data_[i]; 43 | } 44 | 45 | inline int64_t size() const { 46 | return data_.size(); 47 | } 48 | void zero(); 49 | void mul(real); 50 | real norm() const; 51 | void addVector(const Vector& source); 52 | void addVector(const Vector&, real); 53 | void addRow(const Matrix&, int64_t); 54 | void addRow(const Matrix&, int64_t, real); 55 | void mul(const Matrix&, const Vector&); 56 | int64_t argmax(); 57 | }; 58 | 59 | std::ostream& operator<<(std::ostream&, const Vector&); 60 | 61 | } // namespace fasttext 62 | -------------------------------------------------------------------------------- /man/predict.Rcpp_fastrtext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{predict.Rcpp_fastrtext} 4 | \alias{predict.Rcpp_fastrtext} 5 | \title{Get predictions (for supervised model)} 6 | \usage{ 7 | \method{predict}{Rcpp_fastrtext}(object, sentences, k = 1, 8 | simplify = FALSE, unlock_empty_predictions = FALSE, threshold = 0, 9 | ...) 
10 | } 11 | \arguments{ 12 | \item{object}{trained \code{fastText} model} 13 | 14 | \item{sentences}{\link{character} containing the sentences} 15 | 16 | \item{k}{will return the \code{k} most probable labels (default = 1)} 17 | 18 | \item{simplify}{when \link{TRUE} and \code{k} = 1, function return a (flat) \link{numeric} instead of a \link{list}} 19 | 20 | \item{unlock_empty_predictions}{\link{logical} to avoid crash when some predictions are not provided for some sentences because all their words have not been seen during training. This parameter should only be set to \link{TRUE} to debug.} 21 | 22 | \item{threshold}{used to limit number of words used. (optional; 0.0 by default)} 23 | 24 | \item{...}{not used} 25 | } 26 | \value{ 27 | \link{list} containing for each sentence the probability to be associated with \code{k} labels. 28 | } 29 | \description{ 30 | Apply the trained model to new sentences. 31 | Average word embeddings and search most similar \code{label} vector. 32 | } 33 | \examples{ 34 | 35 | library(fastrtext) 36 | data("test_sentences") 37 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 38 | model <- load_model(model_test_path) 39 | sentence <- test_sentences[1, "text"] 40 | print(predict(model, sentence)) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/fasttext/quantmatrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include "real.h" 19 | 20 | #include "densematrix.h" 21 | #include "matrix.h" 22 | #include "vector.h" 23 | 24 | #include "productquantizer.h" 25 | 26 | namespace fasttext { 27 | 28 | class QuantMatrix : public Matrix { 29 | protected: 30 | std::unique_ptr pq_; 31 | std::unique_ptr npq_; 32 | 33 | std::vector codes_; 34 | std::vector norm_codes_; 35 | 36 | bool qnorm_; 37 | int32_t codesize_; 38 | 39 | public: 40 | QuantMatrix(); 41 | QuantMatrix(DenseMatrix&&, int32_t, bool); 42 | QuantMatrix(const QuantMatrix&) = delete; 43 | QuantMatrix(QuantMatrix&&) = delete; 44 | QuantMatrix& operator=(const QuantMatrix&) = delete; 45 | QuantMatrix& operator=(QuantMatrix&&) = delete; 46 | virtual ~QuantMatrix() noexcept override = default; 47 | 48 | void quantizeNorm(const Vector&); 49 | void quantize(DenseMatrix&& mat); 50 | 51 | real dotRow(const Vector&, int64_t) const override; 52 | void addVectorToRow(const Vector&, int64_t, real) override; 53 | void addRowToVector(Vector& x, int32_t i) const override; 54 | void addRowToVector(Vector& x, int32_t i, real a) const override; 55 | void save(std::ostream&) const override; 56 | void load(std::istream&) override; 57 | void dump(std::ostream&) const override; 58 | }; 59 | 60 | } // namespace fasttext 61 | -------------------------------------------------------------------------------- /src/fasttext/productquantizer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "real.h" 18 | #include "vector.h" 19 | 20 | namespace fasttext { 21 | 22 | class ProductQuantizer { 23 | protected: 24 | const int32_t nbits_ = 8; 25 | const int32_t ksub_ = 1 << nbits_; 26 | const int32_t max_points_per_cluster_ = 256; 27 | const int32_t max_points_ = max_points_per_cluster_ * ksub_; 28 | const int32_t seed_ = 1234; 29 | const int32_t niter_ = 25; 30 | const real eps_ = 1e-7; 31 | 32 | int32_t dim_; 33 | int32_t nsubq_; 34 | int32_t dsub_; 35 | int32_t lastdsub_; 36 | 37 | std::vector centroids_; 38 | 39 | std::minstd_rand rng; 40 | 41 | public: 42 | ProductQuantizer() {} 43 | ProductQuantizer(int32_t, int32_t); 44 | 45 | real* get_centroids(int32_t, uint8_t); 46 | const real* get_centroids(int32_t, uint8_t) const; 47 | 48 | real assign_centroid(const real*, const real*, uint8_t*, int32_t) const; 49 | void Estep(const real*, const real*, uint8_t*, int32_t, int32_t) const; 50 | void MStep(const real*, real*, const uint8_t*, int32_t, int32_t); 51 | void kmeans(const real*, real*, int32_t, int32_t); 52 | void train(int, const real*); 53 | 54 | real mulcode(const Vector&, const uint8_t*, int32_t, real) const; 55 | void addcode(Vector&, const uint8_t*, int32_t, real) const; 56 | void compute_code(const real*, uint8_t*) const; 57 | void compute_codes(const real*, uint8_t*, int32_t) const; 58 | 59 | void save(std::ostream&) const; 60 | void load(std::istream&); 61 | }; 62 | 63 | } // namespace fasttext 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![fastrtext](https://github.com/pommedeterresautee/fastrtext/raw/master/tools/logo.png) 2 | ========= 3 | 4 | [![Travis-CI Build 
Status](https://travis-ci.org/pommedeterresautee/fastrtext.svg?branch=master)](https://travis-ci.org/pommedeterresautee/fastrtext) 5 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/pommedeterresautee/fastrtext?branch=master&svg=true)](https://ci.appveyor.com/project/pommedeterresautee/fastrtext) 6 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/fastrtext)](https://cran.r-project.org/package=fastrtext) 7 | [![CRAN_time_from_release](https://www.r-pkg.org/badges/ago/fastrtext)](https://cran.r-project.org/package=fastrtext) 8 | [![CRAN_Download](http://cranlogs.r-pkg.org/badges/fastrtext)](http://cran.rstudio.com/web/packages/fastrtext/index.html) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 10 | [![codecov](https://codecov.io/gh/pommedeterresautee/fastrtext/branch/master/graph/badge.svg)](https://codecov.io/gh/pommedeterresautee/fastrtext) 11 | [![Follow](https://img.shields.io/twitter/follow/pommedeterre33.svg?style=social)](https://twitter.com/intent/follow?screen_name=pommedeterre33) 12 | 13 | [R Documentation](https://pommedeterresautee.github.io/fastrtext/) | [Release Notes](https://github.com/pommedeterresautee/fastrtext/blob/master/NEWS.md) | [FAQ](https://fasttext.cc/docs/en/faqs.html) | [Multilingual pretrained models](https://fasttext.cc/docs/en/crawl-vectors.html) 14 | 15 | R wrapper for [fastText](https://github.com/facebookresearch/fastText) C++ code from Facebook. 16 | 17 | FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. 18 | 19 | 20 | ## License 21 | 22 | © Contributors, 2019. Licensed under a MIT license. 
23 | -------------------------------------------------------------------------------- /man/execute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{execute} 4 | \alias{execute} 5 | \title{Execute command on \code{fastText} model (including training)} 6 | \usage{ 7 | execute(commands) 8 | } 9 | \arguments{ 10 | \item{commands}{\link{character} of commands} 11 | } 12 | \description{ 13 | Use the same commands than the one to use for the command line. 14 | } 15 | \examples{ 16 | \dontrun{ 17 | # Supervised learning example 18 | library(fastrtext) 19 | 20 | data("train_sentences") 21 | data("test_sentences") 22 | 23 | # prepare data 24 | tmp_file_model <- tempfile() 25 | 26 | train_labels <- paste0("__label__", train_sentences[,"class.text"]) 27 | train_texts <- tolower(train_sentences[,"text"]) 28 | train_to_write <- paste(train_labels, train_texts) 29 | train_tmp_file_txt <- tempfile() 30 | writeLines(text = train_to_write, con = train_tmp_file_txt) 31 | 32 | test_labels <- paste0("__label__", test_sentences[,"class.text"]) 33 | test_texts <- tolower(test_sentences[,"text"]) 34 | test_to_write <- paste(test_labels, test_texts) 35 | 36 | # learn model 37 | execute(commands = c("supervised", "-input", train_tmp_file_txt, 38 | "-output", tmp_file_model, "-dim", 20, "-lr", 1, 39 | "-epoch", 20, "-wordNgrams", 2, "-verbose", 1)) 40 | 41 | model <- load_model(tmp_file_model) 42 | predict(model, sentences = test_sentences[1, "text"]) 43 | 44 | # Unsupervised learning example 45 | library(fastrtext) 46 | 47 | data("train_sentences") 48 | data("test_sentences") 49 | texts <- tolower(train_sentences[,"text"]) 50 | tmp_file_txt <- tempfile() 51 | tmp_file_model <- tempfile() 52 | writeLines(text = texts, con = tmp_file_txt) 53 | execute(commands = c("skipgram", "-input", tmp_file_txt, "-output", tmp_file_model, "-verbose", 1)) 54 | 55 | model 
<- load_model(tmp_file_model) 56 | dict <- get_dictionary(model) 57 | get_word_vectors(model, head(dict, 5)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /man/test_sentences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{test_sentences} 5 | \alias{test_sentences} 6 | \title{Sentence corpus - test part} 7 | \format{2 data frame with 3117 rows and 2 variables: 8 | \describe{ 9 | \item{text}{the sentences as a character vector} 10 | \item{class.text}{the category of the sentence} 11 | }} 12 | \source{ 13 | \url{https://archive.ics.uci.edu/ml/index.php} 14 | } 15 | \usage{ 16 | test_sentences 17 | } 18 | \description{ 19 | This corpus contains sentences from 20 | the abstract and introduction of 30 scientific articles that have been 21 | annotated (i.e. labeled or tagged) according to a modified version of the 22 | Argumentative Zones annotation scheme. 23 | } 24 | \details{ 25 | These 30 scientific articles come 26 | from three different domains: 27 | \enumerate{ 28 | \item PLoS Computational Biology (PLOS) 29 | \item The machine learning repository on arXiv (ARXIV) 30 | \item The psychology journal Judgment and Decision Making (JDM) 31 | } 32 | 33 | There are 10 articles from each domain. In addition to the labeled data, this 34 | corpus also contains a corresponding set of unlabeled articles. These unlabeled 35 | articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 36 | from each domain (again, only the sentences from the abstract and 37 | introduction). These unlabeled articles can be used for unsupervised or 38 | semi-supervised approaches to sentence classification which rely on a small set 39 | of labeled data and a larger set of unlabeled data. 40 | 41 | ===== References ===== 42 | 43 | S. Teufel and M. Moens. 
Summarizing scientific articles: experiments with 44 | relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 45 | 2002. 46 | 47 | S. Teufel. Argumentative zoning: information extraction from scientific 48 | text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/train_sentences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{train_sentences} 5 | \alias{train_sentences} 6 | \title{Sentence corpus - train part} 7 | \format{2 data frame with 3117 rows and 2 variables: 8 | \describe{ 9 | \item{text}{the sentences as a character vector} 10 | \item{class.text}{the category of the sentence} 11 | }} 12 | \source{ 13 | \url{https://archive.ics.uci.edu/ml/index.php} 14 | } 15 | \usage{ 16 | train_sentences 17 | } 18 | \description{ 19 | This corpus contains sentences from 20 | the abstract and introduction of 30 scientific articles that have been 21 | annotated (i.e. labeled or tagged) according to a modified version of the 22 | Argumentative Zones annotation scheme. 23 | } 24 | \details{ 25 | These 30 scientific articles come 26 | from three different domains: 27 | \enumerate{ 28 | \item PLoS Computational Biology (PLOS) 29 | \item The machine learning repository on arXiv (ARXIV) 30 | \item The psychology journal Judgment and Decision Making (JDM) 31 | } 32 | 33 | There are 10 articles from each domain. In addition to the labeled data, this 34 | corpus also contains a corresponding set of unlabeled articles. These unlabeled 35 | articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 36 | from each domain (again, only the sentences from the abstract and 37 | introduction). 
These unlabeled articles can be used for unsupervised or 38 | semi-supervised approaches to sentence classification which rely on a small set 39 | of labeled data and a larger set of unlabeled data. 40 | 41 | ===== References ===== 42 | 43 | S. Teufel and M. Moens. Summarizing scientific articles: experiments with 44 | relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 45 | 2002. 46 | 47 | S. Teufel. Argumentative zoning: information extraction from scientific 48 | text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /src/fasttext/meter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | 14 | #include "dictionary.h" 15 | #include "real.h" 16 | #include "utils.h" 17 | 18 | namespace fasttext { 19 | 20 | class Meter { 21 | struct Metrics { 22 | uint64_t gold; 23 | uint64_t predicted; 24 | uint64_t predictedGold; 25 | mutable std::vector> scoreVsTrue; 26 | 27 | Metrics() : gold(0), predicted(0), predictedGold(0) {} 28 | 29 | double precision() const { 30 | if (predicted == 0) { 31 | return std::numeric_limits::quiet_NaN(); 32 | } 33 | return predictedGold / double(predicted); 34 | } 35 | double recall() const { 36 | if (gold == 0) { 37 | return std::numeric_limits::quiet_NaN(); 38 | } 39 | return predictedGold / double(gold); 40 | } 41 | double f1Score() const { 42 | if (predicted + gold == 0) { 43 | return std::numeric_limits::quiet_NaN(); 44 | } 45 | return 2 * predictedGold / double(predicted + gold); 46 | } 47 | }; 48 | 49 | public: 50 | Meter() : metrics_(), nexamples_(0), labelMetrics_() {} 51 | 52 | void log(const std::vector& labels, const Predictions& predictions); 53 | 54 | double precision(int32_t); 55 | double recall(int32_t); 56 | double f1Score(int32_t); 57 | double precision() const; 58 | double recall() const; 59 | double f1Score() const; 60 | uint64_t nexamples() const { 61 | return nexamples_; 62 | } 63 | void writeGeneralMetrics(std::ostream& out, int32_t k) const; 64 | 65 | private: 66 | Metrics metrics_{}; 67 | uint64_t nexamples_; 68 | std::unordered_map labelMetrics_; 69 | }; 70 | 71 | } // namespace fasttext 72 | -------------------------------------------------------------------------------- /src/fasttext/model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "matrix.h" 17 | #include "real.h" 18 | #include "utils.h" 19 | #include "vector.h" 20 | 21 | namespace fasttext { 22 | 23 | class Loss; 24 | 25 | class Model { 26 | protected: 27 | std::shared_ptr wi_; 28 | std::shared_ptr wo_; 29 | std::shared_ptr loss_; 30 | bool normalizeGradient_; 31 | 32 | public: 33 | Model( 34 | std::shared_ptr wi, 35 | std::shared_ptr wo, 36 | std::shared_ptr loss, 37 | bool normalizeGradient); 38 | Model(const Model& model) = delete; 39 | Model(Model&& model) = delete; 40 | Model& operator=(const Model& other) = delete; 41 | Model& operator=(Model&& other) = delete; 42 | 43 | class State { 44 | private: 45 | real lossValue_; 46 | int64_t nexamples_; 47 | 48 | public: 49 | Vector hidden; 50 | Vector output; 51 | Vector grad; 52 | std::minstd_rand rng; 53 | 54 | State(int32_t hiddenSize, int32_t outputSize, int32_t seed); 55 | real getLoss() const; 56 | void incrementNExamples(real loss); 57 | }; 58 | 59 | void predict( 60 | const std::vector& input, 61 | int32_t k, 62 | real threshold, 63 | Predictions& heap, 64 | State& state) const; 65 | void update( 66 | const std::vector& input, 67 | const std::vector& targets, 68 | int32_t targetIndex, 69 | real lr, 70 | State& state); 71 | void computeHidden(const std::vector& input, State& state) const; 72 | 73 | real std_log(real) const; 74 | 75 | static const int32_t kUnlimitedPredictions = -1; 76 | static const int32_t kAllLabelsAsTarget = -1; 77 | }; 78 | 79 | } // namespace fasttext 80 | -------------------------------------------------------------------------------- /vignettes/supervised_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Supervised learning" 3 | author: "M. 
Benesty" 4 | output: rmarkdown::html_vignette 5 | date: "`r Sys.Date()`" 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{Supervised learning} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r supervised_learning} 13 | library(fastrtext) 14 | 15 | data("train_sentences") 16 | data("test_sentences") 17 | 18 | # prepare data 19 | tmp_file_model <- tempfile() 20 | 21 | train_labels <- paste0("__label__", train_sentences[,"class.text"]) 22 | train_texts <- tolower(train_sentences[,"text"]) 23 | train_to_write <- paste(train_labels, train_texts) 24 | train_tmp_file_txt <- tempfile() 25 | writeLines(text = train_to_write, con = train_tmp_file_txt) 26 | 27 | test_labels <- paste0("__label__", test_sentences[,"class.text"]) 28 | test_labels_without_prefix <- test_sentences[,"class.text"] 29 | test_texts <- tolower(test_sentences[,"text"]) 30 | test_to_write <- paste(test_labels, test_texts) 31 | 32 | # learn model 33 | execute(commands = c("supervised", "-input", train_tmp_file_txt, "-output", tmp_file_model, "-dim", 20, "-lr", 1, "-epoch", 20, "-wordNgrams", 2, "-verbose", 1)) 34 | 35 | # load model 36 | model <- load_model(tmp_file_model) 37 | 38 | # prediction are returned as a list with words and probabilities 39 | predictions <- predict(model, sentences = test_to_write) 40 | print(head(predictions, 5)) 41 | 42 | # Compute accuracy 43 | mean(names(unlist(predictions)) == test_labels_without_prefix) 44 | 45 | # because there is only one category by observation, hamming loss will be the same 46 | get_hamming_loss(as.list(test_labels_without_prefix), predictions) 47 | 48 | # test predictions 49 | predictions <- predict(model, sentences = test_to_write) 50 | print(head(predictions, 5)) 51 | 52 | # you can get flat list of results when you are retrieving only one label per observation 53 | print(head(predict(model, sentences = test_to_write, simplify = TRUE))) 54 | 55 | # free memory 56 | unlink(train_tmp_file_txt) 57 | 
unlink(tmp_file_model) 58 | rm(model) 59 | gc() 60 | ``` 61 | -------------------------------------------------------------------------------- /src/add_prefix.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | // [[Rcpp::interfaces(r, cpp)]] 3 | 4 | #include 5 | using namespace Rcpp; 6 | 7 | std::string add_pr(const std::string& line, const std::string& prefix); 8 | 9 | //' Add a prefix to each word 10 | //' 11 | //' Add a custom prefix to each word of a a line to create different spaces. 12 | //' Code in C++ (efficient). 13 | //' 14 | //' @param texts a [character] containing the original text 15 | //' @param prefix unit [character] containing the prefix to add (length == 1) or [character] with same length than texts 16 | //' @return [character] with prefixed words. 17 | //' @examples 18 | //' add_prefix(c("this is a test", "this is another test"), "#") 19 | //' @export 20 | // [[Rcpp::export]] 21 | CharacterVector add_prefix(const CharacterVector& texts, CharacterVector prefix) { 22 | 23 | const bool unique_prefix = prefix.size() == 1; 24 | 25 | if (!unique_prefix && prefix.size() != texts.size()) { 26 | stop("prefix should be a single string or the same size than text"); 27 | } 28 | 29 | std::string current_prefix; 30 | 31 | if (unique_prefix) { 32 | current_prefix = as(prefix[0]); 33 | } 34 | 35 | CharacterVector result(texts.size()); 36 | 37 | for (R_len_t i = 0; i < texts.size(); ++i) { 38 | if (!unique_prefix) { 39 | current_prefix = as(prefix[i]); 40 | } 41 | result[i] = add_pr(as(texts[i]), current_prefix); 42 | } 43 | return result; 44 | } 45 | 46 | // [[Rcpp::export]] 47 | std::string add_pr(const std::string& line, const std::string& prefix) { 48 | if (line.size() % 10 == 0) checkUserInterrupt(); 49 | 50 | std::string result; 51 | result.reserve(line.size() * 1.5); 52 | 53 | bool last_char_is_space = true; 54 | bool current_char_is_space; 55 | for (const char& current_char: line) { 
56 | current_char_is_space = (current_char == ' ') | (current_char == '\t'); 57 | if (last_char_is_space && !current_char_is_space) { 58 | result += prefix; 59 | } 60 | 61 | last_char_is_space = current_char_is_space; 62 | result += current_char; 63 | } 64 | return result; 65 | } 66 | -------------------------------------------------------------------------------- /src/fasttext/meter.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "meter.h" 10 | #include "utils.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | void Meter::log( 20 | const std::vector& labels, 21 | const Predictions& predictions) { 22 | nexamples_++; 23 | metrics_.gold += labels.size(); 24 | metrics_.predicted += predictions.size(); 25 | 26 | for (const auto& prediction : predictions) { 27 | labelMetrics_[prediction.second].predicted++; 28 | 29 | real score = std::exp(prediction.first); 30 | real gold = 0.0; 31 | if (utils::contains(labels, prediction.second)) { 32 | labelMetrics_[prediction.second].predictedGold++; 33 | metrics_.predictedGold++; 34 | gold = 1.0; 35 | } 36 | labelMetrics_[prediction.second].scoreVsTrue.emplace_back(score, gold); 37 | } 38 | 39 | for (const auto& label : labels) { 40 | labelMetrics_[label].gold++; 41 | } 42 | } 43 | 44 | double Meter::precision(int32_t i) { 45 | return labelMetrics_[i].precision(); 46 | } 47 | 48 | double Meter::recall(int32_t i) { 49 | return labelMetrics_[i].recall(); 50 | } 51 | 52 | double Meter::f1Score(int32_t i) { 53 | return labelMetrics_[i].f1Score(); 54 | } 55 | 56 | double Meter::precision() const { 57 | return metrics_.precision(); 58 | } 59 | 60 | double Meter::recall() const { 61 | return 
metrics_.recall(); 62 | } 63 | 64 | double Meter::f1Score() const { 65 | const double precision = this->precision(); 66 | const double recall = this->recall(); 67 | if (precision + recall != 0) { 68 | return 2 * precision * recall / (precision + recall); 69 | } 70 | return std::numeric_limits::quiet_NaN(); 71 | } 72 | 73 | void Meter::writeGeneralMetrics(std::ostream& out, int32_t k) const { 74 | out << "N" 75 | << "\t" << nexamples_ << std::endl; 76 | out << std::setprecision(3); 77 | out << "P@" << k << "\t" << metrics_.precision() << std::endl; 78 | out << "R@" << k << "\t" << metrics_.recall() << std::endl; 79 | } 80 | 81 | } // namespace fasttext 82 | -------------------------------------------------------------------------------- /man/build_vectors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{build_vectors} 4 | \alias{build_vectors} 5 | \title{Build fasttext vectors} 6 | \usage{ 7 | build_vectors(documents, model_path, modeltype = c("skipgram", "cbow"), 8 | bucket = 2e+06, dim = 100, epoch = 5, label = "__label__", 9 | loss = c("ns", "hs", "softmax", "ova", "one-vs-all"), lr = 0.05, 10 | lrUpdateRate = 100, maxn = 6, minCount = 5, minn = 3, neg = 5, 11 | t = 1e-04, thread = 12, verbose = 2, wordNgrams = 1, ws = 5) 12 | } 13 | \arguments{ 14 | \item{documents}{character vector of documents used for training} 15 | 16 | \item{model_path}{Name of output file \emph{without} file extension.} 17 | 18 | \item{modeltype}{Should training be done using skipgram or cbow? Defaults to skipgram.} 19 | 20 | \item{bucket}{number of buckets} 21 | 22 | \item{dim}{size of word vectors} 23 | 24 | \item{epoch}{number of epochs} 25 | 26 | \item{label}{text string, labels prefix. 
Default is "\strong{label}"} 27 | 28 | \item{loss}{loss function {ns, hs, softmax}} 29 | 30 | \item{lr}{learning rate} 31 | 32 | \item{lrUpdateRate}{change the rate of updates for the learning rate} 33 | 34 | \item{maxn}{max length of char ngram} 35 | 36 | \item{minCount}{minimal number of word occurences} 37 | 38 | \item{minn}{min length of char ngram} 39 | 40 | \item{neg}{number of negatives sampled} 41 | 42 | \item{t}{sampling threshold} 43 | 44 | \item{thread}{number of threads} 45 | 46 | \item{verbose}{verbosity level} 47 | 48 | \item{wordNgrams}{max length of word ngram} 49 | 50 | \item{ws}{size of the context window} 51 | } 52 | \value{ 53 | path to model file, as character 54 | } 55 | \description{ 56 | Trains a fasttext vector/unsupervised model following method described in 57 | \href{https://arxiv.org/abs/1607.04606}{Enriching Word Vectors with Subword Information} 58 | using the \href{https://fasttext.cc/}{fasttext} implementation. 59 | 60 | See \href{https://fasttext.cc/docs/en/unsupervised-tutorial.html}{FastText word representation tutorial} for more information on 61 | training unsupervised models using fasttext. 
62 | } 63 | \examples{ 64 | \dontrun{ 65 | library(fastrtext) 66 | text <- train_sentences 67 | model_file <- build_vectors(text[['text']], 'my_model') 68 | model <- load_model(model_file) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # 0.3.4 (10/27/19) 2 | 3 | * remove deprecated code to fix Cran warnings 4 | * update to last FastText version 5 | * support one vs all loss 6 | * less macro (remove possibility to stop learning with CTRL+C) 7 | 8 | # 0.3.2 (10.04.19) 9 | 10 | * following Cran email, remove debug symbols stripping 11 | * make unit tests stronger 12 | 13 | # 0.3.1 (07.03.19) 14 | 15 | * update the C++ source code to the current fasttext version 16 | * remove analogies function 17 | * update error message 18 | * add function to add prefix to words (to create different spaces in the same dataset, useful for classification in particular) 19 | * simplify tests 20 | 21 | # 0.2.6 (31.1.19) 22 | 23 | * use -pthread flag for better Cran compliance 24 | 25 | # 0.2.5 (4.1.18) 26 | 27 | * add get word id function 28 | * add tokenizer function 29 | * change the way sentence vectors are computed (use fastText internal code to improve speed) 30 | * remove RcppThread due to change in FB source code (no more printing from multiple threads at the same time) 31 | * add possibility to interrupt learning 32 | 33 | # 0.2.4 (9.12.17) 34 | 35 | * major refactoring 36 | * update to last version of fastText source code 37 | * sentence representation function 38 | * add tags function 39 | * fix compilation on Windows R Dev 40 | * better Makevars (related to strippedLib task) 41 | 42 | # 0.2.3 (9.11.17) 43 | 44 | * fix a cran note related to the DESCRIPTION file 45 | * remove documentation that is no longer useful because of previous update 46 | * add some asserts to avoid the case where some sentences have no prediction because all their
words are unknown (not seen during training) 47 | * fix compilation on Mac OS 48 | 49 | # 0.2.2 (07.11.17) 50 | 51 | * make possible to interrupt long computation (not for model training part) 52 | * add simplify option to predict (to get flat vector as a result) 53 | * remove prefix label in predict result 54 | * update fastText source code 55 | * fix crash when learning and setting verbose to 2 (calling Rcout from multiple threads crash the application) 56 | 57 | # 0.2.1 (18.09.17) 58 | 59 | * fix small bugs in compilation (mostly for mac os) 60 | * remove all notes (Cran) 61 | 62 | # 0.2.0 (15.09.17) 63 | 64 | * first Cran release 65 | * covers all basic features of fastText 66 | -------------------------------------------------------------------------------- /vignettes/list_commands.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "List of commands" 3 | author: "M. Benesty" 4 | output: rmarkdown::html_vignette 5 | date: "`r Sys.Date()`" 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{List of commands} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | As seen in the other tutorials, `execute()` function works in a similar way to the command line client. 
13 | To list the commands, you just need to enter the following function: 14 | 15 | ```{R eval=FALSE} 16 | library(fastrtext) 17 | 18 | print_help() 19 | ``` 20 | 21 | ``` 22 | The following arguments are mandatory: 23 | -input training file path 24 | -output output file path 25 | 26 | The following arguments are optional: 27 | -verbose verbosity level [2] 28 | 29 | The following arguments for the dictionary are optional: 30 | -minCount minimal number of word occurrences [1] 31 | -minCountLabel minimal number of label occurrences [0] 32 | -wordNgrams max length of word ngram [1] 33 | -bucket number of buckets [2000000] 34 | -minn min length of char ngram [0] 35 | -maxn max length of char ngram [0] 36 | -t sampling threshold [0.0001] 37 | -label labels prefix [__label__] 38 | 39 | The following arguments for training are optional: 40 | -lr learning rate [0.1] 41 | -lrUpdateRate change the rate of updates for the learning rate [100] 42 | -dim size of word vectors [100] 43 | -ws size of the context window [5] 44 | -epoch number of epochs [5] 45 | -neg number of negatives sampled [5] 46 | -loss loss function {ns, hs, softmax} [softmax] 47 | -thread number of threads [12] 48 | -pretrainedVectors pretrained word vectors for supervised learning [] 49 | -saveOutput whether output params should be saved [0] 50 | 51 | The following arguments for quantization are optional: 52 | -cutoff number of words and ngrams to retain [0] 53 | -retrain finetune embeddings if a cutoff is applied [0] 54 | -qnorm quantizing the norm separately [0] 55 | -qout quantizing the classifier [0] 56 | -dsub size of each sub-vector [2] 57 | ``` 58 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. 
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /src/fasttext/vector.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "vector.h" 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | #include "matrix.h" 17 | 18 | namespace fasttext { 19 | 20 | Vector::Vector(int64_t m) : data_(m) {} 21 | 22 | void Vector::zero() { 23 | std::fill(data_.begin(), data_.end(), 0.0); 24 | } 25 | 26 | real Vector::norm() const { 27 | real sum = 0; 28 | for (int64_t i = 0; i < size(); i++) { 29 | sum += data_[i] * data_[i]; 30 | } 31 | return std::sqrt(sum); 32 | } 33 | 34 | void Vector::mul(real a) { 35 | for (int64_t i = 0; i < size(); i++) { 36 | data_[i] *= a; 37 | } 38 | } 39 | 40 | void Vector::addVector(const Vector& source) { 41 | assert(size() == source.size()); 42 | for (int64_t i = 0; i < size(); i++) { 43 | data_[i] += source.data_[i]; 44 | } 45 | } 46 | 47 | void Vector::addVector(const Vector& source, real s) { 48 | assert(size() == source.size()); 49 | for (int64_t i = 0; i < size(); i++) { 50 | data_[i] += s * source.data_[i]; 51 | } 52 | } 53 | 54 | void Vector::addRow(const Matrix& A, int64_t i, real a) { 55 | assert(i >= 0); 56 | assert(i < A.size(0)); 57 | assert(size() == A.size(1)); 58 | A.addRowToVector(*this, i, a); 59 | } 60 | 61 | void Vector::addRow(const Matrix& A, int64_t i) { 62 | assert(i >= 0); 63 | assert(i < A.size(0)); 64 | assert(size() == A.size(1)); 65 | A.addRowToVector(*this, i); 66 | } 67 | 68 | void Vector::mul(const Matrix& A, const Vector& vec) { 69 | assert(A.size(0) == size()); 70 | assert(A.size(1) == vec.size()); 71 | for (int64_t i = 0; i < size(); i++) { 72 | data_[i] = A.dotRow(vec, i); 73 | } 74 | } 75 | 76 | int64_t Vector::argmax() { 77 | real max = data_[0]; 78 | int64_t argmax = 0; 79 | for (int64_t i = 1; i < size(); i++) { 80 | if (data_[i] > max) { 81 | max = data_[i]; 82 | argmax = i; 83 | } 84 | } 85 | return argmax; 86 | } 87 | 88 | std::ostream& operator<<(std::ostream& os, const Vector& v) { 89 | os << std::setprecision(5); 90 | for (int64_t j = 0; j < v.size(); j++) { 91 | os << v[j] << ' '; 92 | } 93 | 
return os; 94 | } 95 | 96 | } // namespace fasttext 97 | -------------------------------------------------------------------------------- /src/fasttext/args.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | enum class model_name : int { cbow = 1, sg, sup }; 20 | enum class loss_name : int { hs = 1, ns, softmax, ova }; 21 | enum class metric_name : int { f1score = 1, labelf1score }; 22 | 23 | class Args { 24 | protected: 25 | std::string boolToString(bool) const; 26 | std::string modelToString(model_name) const; 27 | std::string metricToString(metric_name) const; 28 | std::unordered_set manualArgs_; 29 | 30 | public: 31 | Args(); 32 | std::string input; 33 | std::string output; 34 | double lr; 35 | int lrUpdateRate; 36 | int dim; 37 | int ws; 38 | int epoch; 39 | int minCount; 40 | int minCountLabel; 41 | int neg; 42 | int wordNgrams; 43 | loss_name loss; 44 | model_name model; 45 | int bucket; 46 | int minn; 47 | int maxn; 48 | int thread; 49 | double t; 50 | std::string label; 51 | int verbose; 52 | std::string pretrainedVectors; 53 | bool saveOutput; 54 | int seed; 55 | 56 | bool qout; 57 | bool retrain; 58 | bool qnorm; 59 | size_t cutoff; 60 | size_t dsub; 61 | 62 | std::string autotuneValidationFile; 63 | std::string autotuneMetric; 64 | int autotunePredictions; 65 | int autotuneDuration; 66 | std::string autotuneModelSize; 67 | 68 | void parseArgs(const std::vector& args); 69 | void printHelp(); 70 | void printBasicHelp(); 71 | void printDictionaryHelp(); 72 | void printTrainingHelp(); 73 | void printAutotuneHelp(); 74 | void printQuantizationHelp(); 75 | void 
save(std::ostream&); 76 | void load(std::istream&); 77 | void dump(std::ostream&) const; 78 | bool hasAutotune() const; 79 | bool isManual(const std::string& argName) const; 80 | void setManual(const std::string& argName); 81 | std::string lossToString(loss_name) const; 82 | metric_name getAutotuneMetric() const; 83 | std::string getAutotuneMetricLabel() const; 84 | int64_t getAutotuneModelSize() const; 85 | 86 | static constexpr double kUnlimitedModelSize = -1.0; 87 | }; 88 | } // namespace fasttext 89 | -------------------------------------------------------------------------------- /src/fasttext/densematrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "matrix.h" 19 | #include "real.h" 20 | 21 | namespace fasttext { 22 | 23 | class Vector; 24 | 25 | class DenseMatrix : public Matrix { 26 | protected: 27 | std::vector data_; 28 | void uniformThread(real, int, int32_t); 29 | 30 | public: 31 | DenseMatrix(); 32 | explicit DenseMatrix(int64_t, int64_t); 33 | DenseMatrix(const DenseMatrix&) = default; 34 | DenseMatrix(DenseMatrix&&) noexcept; 35 | DenseMatrix& operator=(const DenseMatrix&) = delete; 36 | DenseMatrix& operator=(DenseMatrix&&) = delete; 37 | virtual ~DenseMatrix() noexcept override = default; 38 | 39 | inline real* data() { 40 | return data_.data(); 41 | } 42 | inline const real* data() const { 43 | return data_.data(); 44 | } 45 | 46 | inline const real& at(int64_t i, int64_t j) const { 47 | assert(i * n_ + j < data_.size()); 48 | return data_[i * n_ + j]; 49 | }; 50 | inline real& at(int64_t i, int64_t j) { 51 | return data_[i * n_ + j]; 52 | }; 53 | 54 | inline 
int64_t rows() const { 55 | return m_; 56 | } 57 | inline int64_t cols() const { 58 | return n_; 59 | } 60 | void zero(); 61 | void uniform(real, unsigned int, int32_t); 62 | 63 | void multiplyRow(const Vector& nums, int64_t ib = 0, int64_t ie = -1); 64 | void divideRow(const Vector& denoms, int64_t ib = 0, int64_t ie = -1); 65 | 66 | real l2NormRow(int64_t i) const; 67 | void l2NormRow(Vector& norms) const; 68 | 69 | real dotRow(const Vector&, int64_t) const override; 70 | void addVectorToRow(const Vector&, int64_t, real) override; 71 | void addRowToVector(Vector& x, int32_t i) const override; 72 | void addRowToVector(Vector& x, int32_t i, real a) const override; 73 | void save(std::ostream&) const override; 74 | void load(std::istream&) override; 75 | void dump(std::ostream&) const override; 76 | 77 | class EncounteredNaNError : public std::runtime_error { 78 | public: 79 | EncounteredNaNError() : std::runtime_error("Encountered NaN.") {} 80 | }; 81 | }; 82 | } // namespace fasttext 83 | -------------------------------------------------------------------------------- /src/fasttext/model.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "model.h" 10 | #include "loss.h" 11 | #include "utils.h" 12 | 13 | #include 14 | #include 15 | 16 | namespace fasttext { 17 | 18 | Model::State::State(int32_t hiddenSize, int32_t outputSize, int32_t seed) 19 | : lossValue_(0.0), 20 | nexamples_(0), 21 | hidden(hiddenSize), 22 | output(outputSize), 23 | grad(hiddenSize), 24 | rng(seed) {} 25 | 26 | real Model::State::getLoss() const { 27 | return lossValue_ / nexamples_; 28 | } 29 | 30 | void Model::State::incrementNExamples(real loss) { 31 | lossValue_ += loss; 32 | nexamples_++; 33 | } 34 | 35 | Model::Model( 36 | std::shared_ptr wi, 37 | std::shared_ptr wo, 38 | std::shared_ptr loss, 39 | bool normalizeGradient) 40 | : wi_(wi), wo_(wo), loss_(loss), normalizeGradient_(normalizeGradient) {} 41 | 42 | void Model::computeHidden(const std::vector& input, State& state) 43 | const { 44 | Vector& hidden = state.hidden; 45 | hidden.zero(); 46 | for (auto it = input.cbegin(); it != input.cend(); ++it) { 47 | hidden.addRow(*wi_, *it); 48 | } 49 | hidden.mul(1.0 / input.size()); 50 | } 51 | 52 | void Model::predict( 53 | const std::vector& input, 54 | int32_t k, 55 | real threshold, 56 | Predictions& heap, 57 | State& state) const { 58 | if (k == Model::kUnlimitedPredictions) { 59 | k = wo_->size(0); // output size 60 | } else if (k <= 0) { 61 | throw std::invalid_argument("k needs to be 1 or higher!"); 62 | } 63 | heap.reserve(k + 1); 64 | computeHidden(input, state); 65 | 66 | loss_->predict(k, threshold, heap, state); 67 | } 68 | 69 | void Model::update( 70 | const std::vector& input, 71 | const std::vector& targets, 72 | int32_t targetIndex, 73 | real lr, 74 | State& state) { 75 | if (input.size() == 0) { 76 | return; 77 | } 78 | computeHidden(input, state); 79 | 80 | Vector& grad = state.grad; 81 | grad.zero(); 82 | real lossValue = loss_->forward(targets, targetIndex, state, lr, true); 83 | state.incrementNExamples(lossValue); 84 | 85 | if (normalizeGradient_) { 86 | grad.mul(1.0 / 
input.size()); 87 | } 88 | for (auto it = input.cbegin(); it != input.cend(); ++it) { 89 | wi_->addVectorToRow(grad, *it, 1.0); 90 | } 91 | } 92 | 93 | real Model::std_log(real x) const { 94 | return std::log(x + 1e-5); 95 | } 96 | 97 | } // namespace fasttext 98 | -------------------------------------------------------------------------------- /src/fasttext/autotune.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "args.h" 18 | #include "fasttext.h" 19 | 20 | namespace fasttext { 21 | 22 | class AutotuneStrategy { 23 | private: 24 | Args bestArgs_; 25 | int maxDuration_; 26 | std::minstd_rand rng_; 27 | int trials_; 28 | int bestMinnIndex_; 29 | int bestDsubExponent_; 30 | int bestNonzeroBucket_; 31 | std::vector minnChoices_; 32 | int getIndex(int val, const std::vector& choices); 33 | 34 | public: 35 | explicit AutotuneStrategy( 36 | const Args& args, 37 | std::minstd_rand::result_type seed); 38 | Args ask(double elapsed); 39 | void updateBest(const Args& args); 40 | }; 41 | 42 | class Autotune { 43 | protected: 44 | std::shared_ptr fastText_; 45 | double elapsed_; 46 | double bestScore_; 47 | int32_t trials_; 48 | int32_t sizeConstraintFailed_; 49 | std::atomic continueTraining_; 50 | std::unique_ptr strategy_; 51 | std::thread timer_; 52 | 53 | bool keepTraining(double maxDuration) const; 54 | void printInfo(double maxDuration); 55 | void timer( 56 | const std::chrono::steady_clock::time_point& start, 57 | double maxDuration); 58 | void abort(); 59 | void startTimer(const Args& args); 60 | double getMetricScore( 61 | Meter& meter, 62 | const metric_name& metricName, 63 | const 
std::string& metricLabel) const; 64 | void printArgs(const Args& args, const Args& autotuneArgs); 65 | void printSkippedArgs(const Args& autotuneArgs); 66 | bool quantize(Args& args, const Args& autotuneArgs); 67 | int getCutoffForFileSize(bool qout, bool qnorm, int dsub, int64_t fileSize) 68 | const; 69 | 70 | class TimeoutError : public std::runtime_error { 71 | public: 72 | TimeoutError() : std::runtime_error("Autotune timed out.") {} 73 | }; 74 | 75 | static constexpr double kUnknownBestScore = -1.0; 76 | static constexpr int kCutoffLimit = 256; 77 | 78 | public: 79 | Autotune() = delete; 80 | explicit Autotune(const std::shared_ptr& fastText); 81 | Autotune(const Autotune&) = delete; 82 | Autotune(Autotune&&) = delete; 83 | Autotune& operator=(const Autotune&) = delete; 84 | Autotune& operator=(Autotune&&) = delete; 85 | ~Autotune() noexcept = default; 86 | 87 | void train(const Args& args); 88 | }; 89 | 90 | } // namespace fasttext 91 | -------------------------------------------------------------------------------- /man/build_supervised.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{build_supervised} 4 | \alias{build_supervised} 5 | \title{Build a supervised fasttext model} 6 | \usage{ 7 | build_supervised(documents, targets, model_path, lr = 0.05, dim = 100, 8 | ws = 5, epoch = 5, minCount = 5, minCountLabel = 0, neg = 5, 9 | wordNgrams = 1, loss = c("ns", "hs", "softmax", "ova", "one-vs-all"), 10 | bucket = 2e+06, minn = 3, maxn = 6, thread = 12, 11 | lrUpdateRate = 100, t = 1e-04, label = "__label__", verbose = 2, 12 | pretrainedVectors = NULL) 13 | } 14 | \arguments{ 15 | \item{documents}{character vector of documents used for training} 16 | 17 | \item{targets}{vector of targets/catagory of each document. 
Must have same length as \code{documents} and be coercible to character} 18 | 19 | \item{model_path}{Name of output file \emph{without} file extension.} 20 | 21 | \item{lr}{learning rate} 22 | 23 | \item{dim}{size of word vectors} 24 | 25 | \item{ws}{size of the context window} 26 | 27 | \item{epoch}{number of epochs} 28 | 29 | \item{minCount}{minimal number of word occurrences} 30 | 31 | \item{minCountLabel}{minimal number of label occurrences} 32 | 33 | \item{neg}{number of negatives sampled} 34 | 35 | \item{wordNgrams}{max length of word ngram} 36 | 37 | \item{loss}{= c('softmax', 'ns', 'hs', 'ova'), loss function {ns, hs, softmax, one-vs-all}. one-vs-all loss is useful for multi-class problems when you need to apply a threshold for each class score.} 38 | 39 | \item{bucket}{number of buckets} 40 | 41 | \item{minn}{min length of char ngram} 42 | 43 | \item{maxn}{max length of char ngram} 44 | 45 | \item{thread}{number of threads} 46 | 47 | \item{lrUpdateRate}{change the rate of updates for the learning rate} 48 | 49 | \item{t}{sampling threshold} 50 | 51 | \item{label}{text string, labels prefix. Default is "\strong{label}"} 52 | 53 | \item{verbose}{verbosity level} 54 | 55 | \item{pretrainedVectors}{path to pretrained word vectors for supervised learning. Leave empty for no pretrained vectors.} 56 | } 57 | \value{ 58 | path to new model file as a \code{character} 59 | } 60 | \description{ 61 | Trains a supervised model, following the method laid out in 62 | \href{https://arxiv.org/abs/1607.01759}{Bag of Tricks for Efficient Text Classification} 63 | using the \href{https://fasttext.cc/}{fasttext} implementation. 64 | 65 | See \href{https://fasttext.cc/docs/en/supervised-tutorial.html}{FastText text classification tutorial} for more information on 66 | training supervised models using fasttext.
67 | } 68 | \examples{ 69 | \dontrun{ 70 | library(fastrtext) 71 | model_file <- build_supervised(documents = train_sentences[["text"]], 72 | targets =train_sentences[["class.text"]], 73 | model_path = 'my_model', 74 | dim = 20, lr = 1, epoch = 20, wordNgrams = 2) 75 | 76 | model <- load_model(model_file) 77 | 78 | predictions <- predict(model, test_sentences[["text"]]) 79 | mean(sapply(predictions, names) == test_sentences[["class.text"]]) 80 | # ~0.8 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Version 0.3.4 2 | 3 | fix deprecated error messages. 4 | 5 | ## Version 0.3.3 6 | small update 7 | 8 | ## Version 0.3.2 9 | Following Cran e-mail (from Prof Brian Ripley), remove strip-debug in Makevars file. 10 | Now the size of the package is > 10Mb and it generates a Warning regarding its size. 11 | 12 | ``` 13 | Please remove unconditional stripping ASAP and before Apr 24 to safely 14 | retain the package on CRAN. 15 | ``` 16 | 17 | ## Version 0.3.1 18 | Following cran response, tests have been shortened by using multithreading during model training (before use only 1 thread) 19 | 20 | ## Version 0.3.0 21 | * big C++ code update 22 | * fix littl bug in catching C++ exception as reported by Cran server 23 | 24 | I had to re-upload the package as bug in a corner case appeared. 25 | I am sorry for that. 26 | 27 | ## Version 0.2.6 28 | * use -pthread flag for better Cran compliancy 29 | 30 | ## Version 0.2.3 - 08.10.17 - answer 31 | Cran: 32 | Thanks, we see that the Date field is over a month old. 33 | Is this the right version? 34 | 35 | Answer: 36 | This is the right version but I have made an error in the date. 
37 | 38 | 39 | ## Version 0.2.3 - 08.10.17 40 | * fix Cran notes 41 | * fix a bug introduced in the last update 42 | 43 | ## Version 0.2.2 - 06.10.17 44 | * add a dependency to fix some very specific crash due to Rcout called from different threads. 45 | * fix compilation on Mac OS with RcppThread 46 | 47 | ## Version 0.2.1 - 18.09.17 48 | * Fix compilation crash on Mac OS 49 | * remove notes on R devel 50 | 51 | ## Comments from Swetlana Herbrandt - 15.09.17 - 4:45PM (French time) 52 | * please omit the redundant 'R' in your title -> the R is now removed from the title field. 53 | * please write package names and software names in Title and Description in single quotes (e.g. 'FastText'). -> quotes have been applied on any software name. 54 | * please add an URL for 'FastText' in the form or with angle brackets for auto-linking and no space after 'http:' and 'https:' -> the link has been added at the end of the description text. 55 | * we see code lines such as Copyright (c) 2016-present, Facebook, Inc. All rights reserved. Please add all authors and copyright holders in the Authors@R field with the appropriate roles. -> a new person Facebook, Inc. has been added, with the role cph 56 | 57 | Note to Cran 58 | ------------ 59 | The introduction of quotes (see above) has raised a new note: 60 | "The Description field should start with a capital letter." 61 | This is wanted. 62 | 63 | 64 | ## Test environments 65 | * Local Ubuntu 17.04 + R version 3.4.1 66 | * R-Hub Cran check (Linux + Windows) 67 | * AppVeyor, Windows 68 | * Travis-CI, Linux 69 | 70 | ## R CMD check results 71 | * Local: no warning, no note 72 | * Travis-CI: no warning, no note 73 | * R-Hub Cran check: 74 | * 1 note: "Possibly mis-spelled words in DESCRIPTION" -> there is no error 75 | * AppVeyor: 76 | * 1 note: "Found no calls to: 'R_registerRoutines', 'R_useDynamicSymbols'" -> AppVeyor tool chain is known to not be up to date, it may be the cause of this note.
The note can't be reproduced on R-Hub Windows check. 77 | -------------------------------------------------------------------------------- /inst/include/fastrtext_RcppExports.h: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #ifndef RCPP_fastrtext_RCPPEXPORTS_H_GEN_ 5 | #define RCPP_fastrtext_RCPPEXPORTS_H_GEN_ 6 | 7 | #include 8 | 9 | namespace fastrtext { 10 | 11 | using namespace Rcpp; 12 | 13 | namespace { 14 | void validateSignature(const char* sig) { 15 | Rcpp::Function require = Rcpp::Environment::base_env()["require"]; 16 | require("fastrtext", Rcpp::Named("quietly") = true); 17 | typedef int(*Ptr_validate)(const char*); 18 | static Ptr_validate p_validate = (Ptr_validate) 19 | R_GetCCallable("fastrtext", "_fastrtext_RcppExport_validate"); 20 | if (!p_validate(sig)) { 21 | throw Rcpp::function_not_exported( 22 | "C++ function with signature '" + std::string(sig) + "' not found in fastrtext"); 23 | } 24 | } 25 | } 26 | 27 | inline CharacterVector add_prefix(const CharacterVector& texts, CharacterVector prefix) { 28 | typedef SEXP(*Ptr_add_prefix)(SEXP,SEXP); 29 | static Ptr_add_prefix p_add_prefix = NULL; 30 | if (p_add_prefix == NULL) { 31 | validateSignature("CharacterVector(*add_prefix)(const CharacterVector&,CharacterVector)"); 32 | p_add_prefix = (Ptr_add_prefix)R_GetCCallable("fastrtext", "_fastrtext_add_prefix"); 33 | } 34 | RObject rcpp_result_gen; 35 | { 36 | RNGScope RCPP_rngScope_gen; 37 | rcpp_result_gen = p_add_prefix(Shield(Rcpp::wrap(texts)), Shield(Rcpp::wrap(prefix))); 38 | } 39 | if (rcpp_result_gen.inherits("interrupted-error")) 40 | throw Rcpp::internal::InterruptedException(); 41 | if (Rcpp::internal::isLongjumpSentinel(rcpp_result_gen)) 42 | throw Rcpp::LongjumpException(rcpp_result_gen); 43 | if (rcpp_result_gen.inherits("try-error")) 44 | throw 
Rcpp::exception(Rcpp::as(rcpp_result_gen).c_str()); 45 | return Rcpp::as(rcpp_result_gen); 46 | } 47 | 48 | inline std::string add_pr(const std::string& line, const std::string& prefix) { 49 | typedef SEXP(*Ptr_add_pr)(SEXP,SEXP); 50 | static Ptr_add_pr p_add_pr = NULL; 51 | if (p_add_pr == NULL) { 52 | validateSignature("std::string(*add_pr)(const std::string&,const std::string&)"); 53 | p_add_pr = (Ptr_add_pr)R_GetCCallable("fastrtext", "_fastrtext_add_pr"); 54 | } 55 | RObject rcpp_result_gen; 56 | { 57 | RNGScope RCPP_rngScope_gen; 58 | rcpp_result_gen = p_add_pr(Shield(Rcpp::wrap(line)), Shield(Rcpp::wrap(prefix))); 59 | } 60 | if (rcpp_result_gen.inherits("interrupted-error")) 61 | throw Rcpp::internal::InterruptedException(); 62 | if (Rcpp::internal::isLongjumpSentinel(rcpp_result_gen)) 63 | throw Rcpp::LongjumpException(rcpp_result_gen); 64 | if (rcpp_result_gen.inherits("try-error")) 65 | throw Rcpp::exception(Rcpp::as(rcpp_result_gen).c_str()); 66 | return Rcpp::as(rcpp_result_gen); 67 | } 68 | 69 | } 70 | 71 | #endif // RCPP_fastrtext_RCPPEXPORTS_H_GEN_ 72 | -------------------------------------------------------------------------------- /tests/testthat/test-unsupervised.R: -------------------------------------------------------------------------------- 1 | context("Unsupervised training") 2 | 3 | model_test_path <- system.file("extdata", 4 | "model_unsupervised_test.bin", 5 | package = "fastrtext") 6 | 7 | test_that("Training", { 8 | data("train_sentences") 9 | data("test_sentences") 10 | texts <- tolower(train_sentences[, "text"]) 11 | tmp_file_txt <- tempfile() 12 | tmp_file_model <- tempfile() 13 | writeLines(text = texts, con = tmp_file_txt) 14 | execute(commands = c("skipgram", 15 | "-input", tmp_file_txt, 16 | "-output", tmp_file_model, 17 | "-verbose", 0, 18 | "-dim", 10, 19 | "-bucket", 1e3, 20 | "-loss", "ns", 21 | "-epoch", 3)) 22 | 23 | # Check learned file exists 24 | expect_true(file.exists(paste0(tmp_file_model, ".bin"))) 25 | 
expect_true(file.exists(paste0(tmp_file_model, ".vec"))) 26 | 27 | model <- load_model(tmp_file_model) 28 | parameters <- get_parameters(model) 29 | expect_equal(parameters$model_name, "sg") 30 | 31 | build_vectors(documents = texts, 32 | model_path = tmp_file_model, 33 | modeltype = "skipgram", 34 | bucket = 1e3, 35 | dim = 10, 36 | epoch = 3, 37 | loss = "softmax", 38 | verbose = 0) 39 | 40 | }) 41 | 42 | test_that("Test parameter extraction", { 43 | model <- load_model(model_test_path) 44 | parameters <- get_parameters(model) 45 | expect_equal(parameters$dim, 70) 46 | expect_equal(parameters$model_name, "sg") 47 | }) 48 | 49 | test_that("Test word extraction and word IDs", { 50 | model <- load_model(model_test_path) 51 | dict <- get_dictionary(model) 52 | expect_length(dict, 2061) 53 | expect_true("time" %in% dict) 54 | expect_true("timing" %in% dict) 55 | expect_true("experience" %in% dict) 56 | expect_true("section" %in% dict) 57 | 58 | sentence_to_test <- c("this", "is", "a", "test") 59 | ids <- get_word_ids(model, sentence_to_test) 60 | expect_equal(get_dictionary(model)[ids], sentence_to_test) 61 | }) 62 | 63 | test_that("Tokenization separate words in a text document", { 64 | model <- load_model(model_test_path) 65 | tokens <- get_tokenized_text(model, "this is a test") 66 | expect_equal(tokens, list(c("this", "is", "a", "test"))) 67 | }) 68 | 69 | test_that("Test word embeddings", { 70 | model <- load_model(model_test_path) 71 | 72 | # test vector lentgh 73 | parameters <- get_parameters(model) 74 | expect_length(get_word_vectors(model, "time")[1, ], parameters$dim) 75 | 76 | # test word distance 77 | expect_lt(get_word_distance(model, "introduction", "conclusions"), 78 | get_word_distance(model, "experience", "section")) 79 | expect_lt(get_word_distance(model, "our", "we"), 80 | get_word_distance(model, "introduction", "conclusions")) 81 | }) 82 | 83 | test_that("Nearest neighbours", { 84 | model <- load_model(model_test_path) 85 | nn <- get_nn(model, 
"time", 10) 86 | expect_true("times" %in% names(nn)) 87 | }) 88 | 89 | test_that("Test sentence representation", { 90 | model <- load_model(model_test_path) 91 | m <- get_sentence_representation(model, "this is a test") 92 | expect_length(m, 70) 93 | expect_equal(nrow(m), 1) 94 | m <- get_sentence_representation(model, c("this is a test", "and here is another")) 95 | expect_equal(nrow(m), 2) 96 | expect_false(any(is.na(m))) 97 | }) 98 | 99 | gc() 100 | -------------------------------------------------------------------------------- /src/fasttext/dictionary.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "args.h" 20 | #include "real.h" 21 | 22 | namespace fasttext { 23 | 24 | typedef int32_t id_type; 25 | enum class entry_type : int8_t { word = 0, label = 1 }; 26 | 27 | struct entry { 28 | std::string word; 29 | int64_t count; 30 | entry_type type; 31 | std::vector subwords; 32 | }; 33 | 34 | class Dictionary { 35 | protected: 36 | static const int32_t MAX_VOCAB_SIZE = 30000000; 37 | static const int32_t MAX_LINE_SIZE = 1024; 38 | 39 | int32_t find(const std::string&) const; 40 | int32_t find(const std::string&, uint32_t h) const; 41 | void initTableDiscard(); 42 | void initNgrams(); 43 | void reset(std::istream&) const; 44 | void pushHash(std::vector&, int32_t) const; 45 | void addSubwords(std::vector&, const std::string&, int32_t) const; 46 | 47 | std::shared_ptr args_; 48 | std::vector word2int_; 49 | std::vector words_; 50 | 51 | std::vector pdiscard_; 52 | int32_t size_; 53 | int32_t nwords_; 54 | int32_t nlabels_; 55 | int64_t ntokens_; 56 | 57 | int64_t 
pruneidx_size_; 58 | std::unordered_map pruneidx_; 59 | void addWordNgrams( 60 | std::vector& line, 61 | const std::vector& hashes, 62 | int32_t n) const; 63 | 64 | public: 65 | static const std::string EOS; 66 | static const std::string BOW; 67 | static const std::string EOW; 68 | 69 | explicit Dictionary(std::shared_ptr); 70 | explicit Dictionary(std::shared_ptr, std::istream&); 71 | int32_t nwords() const; 72 | int32_t nlabels() const; 73 | int64_t ntokens() const; 74 | int32_t getId(const std::string&) const; 75 | int32_t getId(const std::string&, uint32_t h) const; 76 | entry_type getType(int32_t) const; 77 | entry_type getType(const std::string&) const; 78 | bool discard(int32_t, real) const; 79 | std::string getWord(int32_t) const; 80 | const std::vector& getSubwords(int32_t) const; 81 | const std::vector getSubwords(const std::string&) const; 82 | void getSubwords( 83 | const std::string&, 84 | std::vector&, 85 | std::vector&) const; 86 | void computeSubwords( 87 | const std::string&, 88 | std::vector&, 89 | std::vector* substrings = nullptr) const; 90 | uint32_t hash(const std::string& str) const; 91 | void add(const std::string&); 92 | bool readWord(std::istream&, std::string&) const; 93 | void readFromFile(std::istream&); 94 | std::string getLabel(int32_t) const; 95 | void save(std::ostream&) const; 96 | void load(std::istream&); 97 | std::vector getCounts(entry_type) const; 98 | int32_t getLine(std::istream&, std::vector&, std::vector&) 99 | const; 100 | int32_t getLine(std::istream&, std::vector&, std::minstd_rand&) 101 | const; 102 | void threshold(int64_t, int64_t); 103 | void prune(std::vector&); 104 | bool isPruned() { 105 | return pruneidx_size_ >= 0; 106 | } 107 | void dump(std::ostream&) const; 108 | void init(); 109 | }; 110 | 111 | } // namespace fasttext 112 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' 
Sentence corpus - train part 2 | #' 3 | #' This corpus contains sentences from 4 | #' the abstract and introduction of 30 scientific articles that have been 5 | #' annotated (i.e. labeled or tagged) according to a modified version of the 6 | #' Argumentative Zones annotation scheme. 7 | #' 8 | #' These 30 scientific articles come 9 | #' from three different domains: 10 | #' 1. PLoS Computational Biology (PLOS) 11 | #' 2. The machine learning repository on arXiv (ARXIV) 12 | #' 3. The psychology journal Judgment and Decision Making (JDM) 13 | #' 14 | #' There are 10 articles from each domain. In addition to the labeled data, this 15 | #' corpus also contains a corresponding set of unlabeled articles. These unlabeled 16 | #' articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 17 | #' from each domain (again, only the sentences from the abstract and 18 | #' introduction). These unlabeled articles can be used for unsupervised or 19 | #' semi-supervised approaches to sentence classification which rely on a small set 20 | #' of labeled data and a larger set of unlabeled data. 21 | #' 22 | #' ===== References ===== 23 | #' 24 | #' S. Teufel and M. Moens. Summarizing scientific articles: experiments with 25 | #' relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 26 | #' 2002. 27 | #' 28 | #' S. Teufel. Argumentative zoning: information extraction from scientific 29 | #' text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 
30 | #' 31 | #' @format 2 data frame with 3117 rows and 2 variables: 32 | #' \describe{ 33 | #' \item{text}{the sentences as a character vector} 34 | #' \item{class.text}{the category of the sentence} 35 | #' } 36 | #' @source \url{https://archive.ics.uci.edu/ml/index.php} 37 | "train_sentences" 38 | 39 | #' Sentence corpus - test part 40 | #' 41 | #' This corpus contains sentences from 42 | #' the abstract and introduction of 30 scientific articles that have been 43 | #' annotated (i.e. labeled or tagged) according to a modified version of the 44 | #' Argumentative Zones annotation scheme. 45 | #' 46 | #' These 30 scientific articles come 47 | #' from three different domains: 48 | #' 1. PLoS Computational Biology (PLOS) 49 | #' 2. The machine learning repository on arXiv (ARXIV) 50 | #' 3. The psychology journal Judgment and Decision Making (JDM) 51 | #' 52 | #' There are 10 articles from each domain. In addition to the labeled data, this 53 | #' corpus also contains a corresponding set of unlabeled articles. These unlabeled 54 | #' articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 55 | #' from each domain (again, only the sentences from the abstract and 56 | #' introduction). These unlabeled articles can be used for unsupervised or 57 | #' semi-supervised approaches to sentence classification which rely on a small set 58 | #' of labeled data and a larger set of unlabeled data. 59 | #' 60 | #' ===== References ===== 61 | #' 62 | #' S. Teufel and M. Moens. Summarizing scientific articles: experiments with 63 | #' relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 64 | #' 2002. 65 | #' 66 | #' S. Teufel. Argumentative zoning: information extraction from scientific 67 | #' text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 
68 | #' 69 | #' @format 2 data frame with 3117 rows and 2 variables: 70 | #' \describe{ 71 | #' \item{text}{the sentences as a character vector} 72 | #' \item{class.text}{the category of the sentence} 73 | #' } 74 | #' @source \url{https://archive.ics.uci.edu/ml/index.php} 75 | "test_sentences" 76 | 77 | #' Stop words list 78 | #' 79 | #' List of words that can be safely removed from sentences. 80 | #' 81 | #' @format Character vector of stop words 82 | #' @source \url{https://archive.ics.uci.edu/ml/index.php} 83 | "stop_words_sentences" 84 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $("#sidebar") 6 | .stick_in_parent({offset_top: 40}) 7 | .on('sticky_kit:bottom', function(e) { 8 | $(this).parent().css('position', 'static'); 9 | }) 10 | .on('sticky_kit:unbottom', function(e) { 11 | $(this).parent().css('position', 'relative'); 12 | }); 13 | 14 | $('body').scrollspy({ 15 | target: '#sidebar', 16 | offset: 60 17 | }); 18 | 19 | $('[data-toggle="tooltip"]').tooltip(); 20 | 21 | var cur_path = paths(location.pathname); 22 | var links = $("#navbar ul li a"); 23 | var max_length = -1; 24 | var pos = -1; 25 | for (var i = 0; i < links.length; i++) { 26 | if (links[i].getAttribute("href") === "#") 27 | continue; 28 | // Ignore external links 29 | if (links[i].host !== location.host) 30 | continue; 31 | 32 | var nav_path = paths(links[i].pathname); 33 | 34 | var length = prefix_length(nav_path, cur_path); 35 | if (length > max_length) { 36 | max_length = length; 37 | pos = i; 38 | } 39 | } 40 | 41 | // Add class to parent
  • , and enclosing
  • if in dropdown 42 | if (pos >= 0) { 43 | var menu_anchor = $(links[pos]); 44 | menu_anchor.parent().addClass("active"); 45 | menu_anchor.closest("li.dropdown").addClass("active"); 46 | } 47 | }); 48 | 49 | function paths(pathname) { 50 | var pieces = pathname.split("/"); 51 | pieces.shift(); // always starts with / 52 | 53 | var end = pieces[pieces.length - 1]; 54 | if (end === "index.html" || end === "") 55 | pieces.pop(); 56 | return(pieces); 57 | } 58 | 59 | // Returns -1 if not found 60 | function prefix_length(needle, haystack) { 61 | if (needle.length > haystack.length) 62 | return(-1); 63 | 64 | // Special case for length-0 haystack, since for loop won't run 65 | if (haystack.length === 0) { 66 | return(needle.length === 0 ? 0 : -1); 67 | } 68 | 69 | for (var i = 0; i < haystack.length; i++) { 70 | if (needle[i] != haystack[i]) 71 | return(i); 72 | } 73 | 74 | return(haystack.length); 75 | } 76 | 77 | /* Clipboard --------------------------*/ 78 | 79 | function changeTooltipMessage(element, msg) { 80 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 81 | element.setAttribute('data-original-title', msg); 82 | $(element).tooltip('show'); 83 | element.setAttribute('data-original-title', tooltipOriginalTitle); 84 | } 85 | 86 | if(ClipboardJS.isSupported()) { 87 | $(document).ready(function() { 88 | var copyButton = ""; 89 | 90 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 91 | 92 | // Insert copy buttons: 93 | $(copyButton).prependTo(".hasCopyButton"); 94 | 95 | // Initialize tooltips: 96 | $('.btn-copy-ex').tooltip({container: 'body'}); 97 | 98 | // Initialize clipboard: 99 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 100 | text: function(trigger) { 101 | return trigger.parentNode.textContent; 102 | } 103 | }); 104 | 105 | clipboardBtnCopies.on('success', function(e) { 106 | changeTooltipMessage(e.trigger, 'Copied!'); 107 | e.clearSelection(); 108 | }); 109 | 110 | clipboardBtnCopies.on('error', 
function() { 111 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 112 | }); 113 | }); 114 | } 115 | })(window.jQuery || window.$) 116 | -------------------------------------------------------------------------------- /src/fasttext/quantmatrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "quantmatrix.h" 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace fasttext { 16 | 17 | QuantMatrix::QuantMatrix() : Matrix(), qnorm_(false), codesize_(0) {} 18 | 19 | QuantMatrix::QuantMatrix(DenseMatrix&& mat, int32_t dsub, bool qnorm) 20 | : Matrix(mat.size(0), mat.size(1)), 21 | qnorm_(qnorm), 22 | codesize_(mat.size(0) * ((mat.size(1) + dsub - 1) / dsub)) { 23 | codes_.resize(codesize_); 24 | pq_ = std::unique_ptr(new ProductQuantizer(n_, dsub)); 25 | if (qnorm_) { 26 | norm_codes_.resize(m_); 27 | npq_ = std::unique_ptr(new ProductQuantizer(1, 1)); 28 | } 29 | quantize(std::forward(mat)); 30 | } 31 | 32 | void QuantMatrix::quantizeNorm(const Vector& norms) { 33 | assert(qnorm_); 34 | assert(norms.size() == m_); 35 | auto dataptr = norms.data(); 36 | npq_->train(m_, dataptr); 37 | npq_->compute_codes(dataptr, norm_codes_.data(), m_); 38 | } 39 | 40 | void QuantMatrix::quantize(DenseMatrix&& mat) { 41 | if (qnorm_) { 42 | Vector norms(mat.size(0)); 43 | mat.l2NormRow(norms); 44 | mat.divideRow(norms); 45 | quantizeNorm(norms); 46 | } 47 | auto dataptr = mat.data(); 48 | pq_->train(m_, dataptr); 49 | pq_->compute_codes(dataptr, codes_.data(), m_); 50 | } 51 | 52 | real QuantMatrix::dotRow(const Vector& vec, int64_t i) const { 53 | assert(i >= 0); 54 | assert(i < m_); 55 | assert(vec.size() == n_); 56 | real norm = 1; 57 | if (qnorm_) { 58 | norm = 
npq_->get_centroids(0, norm_codes_[i])[0]; 59 | } 60 | return pq_->mulcode(vec, codes_.data(), i, norm); 61 | } 62 | 63 | void QuantMatrix::addVectorToRow(const Vector&, int64_t, real) { 64 | throw std::runtime_error("Operation not permitted on quantized matrices."); 65 | } 66 | 67 | void QuantMatrix::addRowToVector(Vector& x, int32_t i, real a) const { 68 | real norm = 1; 69 | if (qnorm_) { 70 | norm = npq_->get_centroids(0, norm_codes_[i])[0]; 71 | } 72 | pq_->addcode(x, codes_.data(), i, a * norm); 73 | } 74 | 75 | void QuantMatrix::addRowToVector(Vector& x, int32_t i) const { 76 | real norm = 1; 77 | if (qnorm_) { 78 | norm = npq_->get_centroids(0, norm_codes_[i])[0]; 79 | } 80 | pq_->addcode(x, codes_.data(), i, norm); 81 | } 82 | 83 | void QuantMatrix::save(std::ostream& out) const { 84 | out.write((char*)&qnorm_, sizeof(qnorm_)); 85 | out.write((char*)&m_, sizeof(m_)); 86 | out.write((char*)&n_, sizeof(n_)); 87 | out.write((char*)&codesize_, sizeof(codesize_)); 88 | out.write((char*)codes_.data(), codesize_ * sizeof(uint8_t)); 89 | pq_->save(out); 90 | if (qnorm_) { 91 | out.write((char*)norm_codes_.data(), m_ * sizeof(uint8_t)); 92 | npq_->save(out); 93 | } 94 | } 95 | 96 | void QuantMatrix::load(std::istream& in) { 97 | in.read((char*)&qnorm_, sizeof(qnorm_)); 98 | in.read((char*)&m_, sizeof(m_)); 99 | in.read((char*)&n_, sizeof(n_)); 100 | in.read((char*)&codesize_, sizeof(codesize_)); 101 | codes_ = std::vector(codesize_); 102 | in.read((char*)codes_.data(), codesize_ * sizeof(uint8_t)); 103 | pq_ = std::unique_ptr(new ProductQuantizer()); 104 | pq_->load(in); 105 | if (qnorm_) { 106 | norm_codes_ = std::vector(m_); 107 | in.read((char*)norm_codes_.data(), m_ * sizeof(uint8_t)); 108 | npq_ = std::unique_ptr(new ProductQuantizer()); 109 | npq_->load(in); 110 | } 111 | } 112 | 113 | void QuantMatrix::dump(std::ostream&) const { 114 | throw std::runtime_error("Operation not permitted on quantized matrices."); 115 | } 116 | 117 | } // namespace fasttext 118 
| -------------------------------------------------------------------------------- /src/fasttext/loss.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "matrix.h" 16 | #include "model.h" 17 | #include "real.h" 18 | #include "utils.h" 19 | #include "vector.h" 20 | 21 | namespace fasttext { 22 | 23 | class Loss { 24 | private: 25 | void findKBest( 26 | int32_t k, 27 | real threshold, 28 | Predictions& heap, 29 | const Vector& output) const; 30 | 31 | protected: 32 | std::vector t_sigmoid_; 33 | std::vector t_log_; 34 | std::shared_ptr& wo_; 35 | 36 | real log(real x) const; 37 | real sigmoid(real x) const; 38 | 39 | public: 40 | explicit Loss(std::shared_ptr& wo); 41 | virtual ~Loss() = default; 42 | 43 | virtual real forward( 44 | const std::vector& targets, 45 | int32_t targetIndex, 46 | Model::State& state, 47 | real lr, 48 | bool backprop) = 0; 49 | virtual void computeOutput(Model::State& state) const = 0; 50 | 51 | virtual void predict( 52 | int32_t /*k*/, 53 | real /*threshold*/, 54 | Predictions& /*heap*/, 55 | Model::State& /*state*/) const; 56 | }; 57 | 58 | class BinaryLogisticLoss : public Loss { 59 | protected: 60 | real binaryLogistic( 61 | int32_t target, 62 | Model::State& state, 63 | bool labelIsPositive, 64 | real lr, 65 | bool backprop) const; 66 | 67 | public: 68 | explicit BinaryLogisticLoss(std::shared_ptr& wo); 69 | virtual ~BinaryLogisticLoss() noexcept override = default; 70 | void computeOutput(Model::State& state) const override; 71 | }; 72 | 73 | class OneVsAllLoss : public BinaryLogisticLoss { 74 | public: 75 | explicit OneVsAllLoss(std::shared_ptr& wo); 76 | ~OneVsAllLoss() noexcept override = 
default; 77 | real forward( 78 | const std::vector& targets, 79 | int32_t targetIndex, 80 | Model::State& state, 81 | real lr, 82 | bool backprop) override; 83 | }; 84 | 85 | class NegativeSamplingLoss : public BinaryLogisticLoss { 86 | protected: 87 | static const int32_t NEGATIVE_TABLE_SIZE = 10000000; 88 | 89 | int neg_; 90 | std::vector negatives_; 91 | std::uniform_int_distribution uniform_; 92 | int32_t getNegative(int32_t target, std::minstd_rand& rng); 93 | 94 | public: 95 | explicit NegativeSamplingLoss( 96 | std::shared_ptr& wo, 97 | int neg, 98 | const std::vector& targetCounts); 99 | ~NegativeSamplingLoss() noexcept override = default; 100 | 101 | real forward( 102 | const std::vector& targets, 103 | int32_t targetIndex, 104 | Model::State& state, 105 | real lr, 106 | bool backprop) override; 107 | }; 108 | 109 | class HierarchicalSoftmaxLoss : public BinaryLogisticLoss { 110 | protected: 111 | struct Node { 112 | int32_t parent; 113 | int32_t left; 114 | int32_t right; 115 | int64_t count; 116 | bool binary; 117 | }; 118 | 119 | std::vector> paths_; 120 | std::vector> codes_; 121 | std::vector tree_; 122 | int32_t osz_; 123 | void buildTree(const std::vector& counts); 124 | void dfs( 125 | int32_t k, 126 | real threshold, 127 | int32_t node, 128 | real score, 129 | Predictions& heap, 130 | const Vector& hidden) const; 131 | 132 | public: 133 | explicit HierarchicalSoftmaxLoss( 134 | std::shared_ptr& wo, 135 | const std::vector& counts); 136 | ~HierarchicalSoftmaxLoss() noexcept override = default; 137 | real forward( 138 | const std::vector& targets, 139 | int32_t targetIndex, 140 | Model::State& state, 141 | real lr, 142 | bool backprop) override; 143 | void predict( 144 | int32_t k, 145 | real threshold, 146 | Predictions& heap, 147 | Model::State& state) const override; 148 | }; 149 | 150 | class SoftmaxLoss : public Loss { 151 | public: 152 | explicit SoftmaxLoss(std::shared_ptr& wo); 153 | ~SoftmaxLoss() noexcept override = default; 154 | real 
forward( 155 | const std::vector& targets, 156 | int32_t targetIndex, 157 | Model::State& state, 158 | real lr, 159 | bool backprop) override; 160 | void computeOutput(Model::State& state) const override; 161 | }; 162 | 163 | } // namespace fasttext 164 | -------------------------------------------------------------------------------- /src/fasttext/densematrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "densematrix.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "utils.h" 16 | #include "vector.h" 17 | 18 | namespace fasttext { 19 | 20 | DenseMatrix::DenseMatrix() : DenseMatrix(0, 0) {} 21 | 22 | DenseMatrix::DenseMatrix(int64_t m, int64_t n) : Matrix(m, n), data_(m * n) {} 23 | 24 | DenseMatrix::DenseMatrix(DenseMatrix&& other) noexcept 25 | : Matrix(other.m_, other.n_), data_(std::move(other.data_)) {} 26 | 27 | void DenseMatrix::zero() { 28 | std::fill(data_.begin(), data_.end(), 0.0); 29 | } 30 | 31 | void DenseMatrix::uniformThread(real a, int block, int32_t seed) { 32 | std::minstd_rand rng(block + seed); 33 | std::uniform_real_distribution<> uniform(-a, a); 34 | int64_t blockSize = (m_ * n_) / 10; 35 | for (int64_t i = blockSize * block; 36 | i < (m_ * n_) && i < blockSize * (block + 1); 37 | i++) { 38 | data_[i] = uniform(rng); 39 | } 40 | } 41 | 42 | void DenseMatrix::uniform(real a, unsigned int thread, int32_t seed) { 43 | std::vector threads; 44 | for (int i = 0; i < thread; i++) { 45 | threads.push_back(std::thread([=]() { uniformThread(a, i, seed); })); 46 | } 47 | for (int32_t i = 0; i < threads.size(); i++) { 48 | threads[i].join(); 49 | } 50 | } 51 | 52 | void DenseMatrix::multiplyRow(const Vector& nums, int64_t ib, int64_t ie) { 53 | if 
(ie == -1) { 54 | ie = m_; 55 | } 56 | assert(ie <= nums.size()); 57 | for (auto i = ib; i < ie; i++) { 58 | real n = nums[i - ib]; 59 | if (n != 0) { 60 | for (auto j = 0; j < n_; j++) { 61 | at(i, j) *= n; 62 | } 63 | } 64 | } 65 | } 66 | 67 | void DenseMatrix::divideRow(const Vector& denoms, int64_t ib, int64_t ie) { 68 | if (ie == -1) { 69 | ie = m_; 70 | } 71 | assert(ie <= denoms.size()); 72 | for (auto i = ib; i < ie; i++) { 73 | real n = denoms[i - ib]; 74 | if (n != 0) { 75 | for (auto j = 0; j < n_; j++) { 76 | at(i, j) /= n; 77 | } 78 | } 79 | } 80 | } 81 | 82 | real DenseMatrix::l2NormRow(int64_t i) const { 83 | auto norm = 0.0; 84 | for (auto j = 0; j < n_; j++) { 85 | norm += at(i, j) * at(i, j); 86 | } 87 | if (std::isnan(norm)) { 88 | throw EncounteredNaNError(); 89 | } 90 | return std::sqrt(norm); 91 | } 92 | 93 | void DenseMatrix::l2NormRow(Vector& norms) const { 94 | assert(norms.size() == m_); 95 | for (auto i = 0; i < m_; i++) { 96 | norms[i] = l2NormRow(i); 97 | } 98 | } 99 | 100 | real DenseMatrix::dotRow(const Vector& vec, int64_t i) const { 101 | assert(i >= 0); 102 | assert(i < m_); 103 | assert(vec.size() == n_); 104 | real d = 0.0; 105 | for (int64_t j = 0; j < n_; j++) { 106 | d += at(i, j) * vec[j]; 107 | } 108 | if (std::isnan(d)) { 109 | throw EncounteredNaNError(); 110 | } 111 | return d; 112 | } 113 | 114 | void DenseMatrix::addVectorToRow(const Vector& vec, int64_t i, real a) { 115 | assert(i >= 0); 116 | assert(i < m_); 117 | assert(vec.size() == n_); 118 | for (int64_t j = 0; j < n_; j++) { 119 | data_[i * n_ + j] += a * vec[j]; 120 | } 121 | } 122 | 123 | void DenseMatrix::addRowToVector(Vector& x, int32_t i) const { 124 | assert(i >= 0); 125 | assert(i < this->size(0)); 126 | assert(x.size() == this->size(1)); 127 | for (int64_t j = 0; j < n_; j++) { 128 | x[j] += at(i, j); 129 | } 130 | } 131 | 132 | void DenseMatrix::addRowToVector(Vector& x, int32_t i, real a) const { 133 | assert(i >= 0); 134 | assert(i < this->size(0)); 
135 | assert(x.size() == this->size(1)); 136 | for (int64_t j = 0; j < n_; j++) { 137 | x[j] += a * at(i, j); 138 | } 139 | } 140 | 141 | void DenseMatrix::save(std::ostream& out) const { 142 | out.write((char*)&m_, sizeof(int64_t)); 143 | out.write((char*)&n_, sizeof(int64_t)); 144 | out.write((char*)data_.data(), m_ * n_ * sizeof(real)); 145 | } 146 | 147 | void DenseMatrix::load(std::istream& in) { 148 | in.read((char*)&m_, sizeof(int64_t)); 149 | in.read((char*)&n_, sizeof(int64_t)); 150 | data_ = std::vector(m_ * n_); 151 | in.read((char*)data_.data(), m_ * n_ * sizeof(real)); 152 | } 153 | 154 | void DenseMatrix::dump(std::ostream& out) const { 155 | out << m_ << " " << n_ << std::endl; 156 | for (int64_t i = 0; i < m_; i++) { 157 | for (int64_t j = 0; j < n_; j++) { 158 | if (j > 0) { 159 | out << " "; 160 | } 161 | out << at(i, j); 162 | } 163 | out << std::endl; 164 | } 165 | }; 166 | 167 | } // namespace fasttext 168 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include "../inst/include/fastrtext.h" 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace Rcpp; 10 | 11 | // add_prefix 12 | CharacterVector add_prefix(const CharacterVector& texts, CharacterVector prefix); 13 | static SEXP _fastrtext_add_prefix_try(SEXP textsSEXP, SEXP prefixSEXP) { 14 | BEGIN_RCPP 15 | Rcpp::RObject rcpp_result_gen; 16 | Rcpp::traits::input_parameter< const CharacterVector& >::type texts(textsSEXP); 17 | Rcpp::traits::input_parameter< CharacterVector >::type prefix(prefixSEXP); 18 | rcpp_result_gen = Rcpp::wrap(add_prefix(texts, prefix)); 19 | return rcpp_result_gen; 20 | END_RCPP_RETURN_ERROR 21 | } 22 | RcppExport SEXP _fastrtext_add_prefix(SEXP textsSEXP, SEXP prefixSEXP) { 23 | SEXP 
rcpp_result_gen; 24 | { 25 | Rcpp::RNGScope rcpp_rngScope_gen; 26 | rcpp_result_gen = PROTECT(_fastrtext_add_prefix_try(textsSEXP, prefixSEXP)); 27 | } 28 | Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error"); 29 | if (rcpp_isInterrupt_gen) { 30 | UNPROTECT(1); 31 | Rf_onintr(); 32 | } 33 | bool rcpp_isLongjump_gen = Rcpp::internal::isLongjumpSentinel(rcpp_result_gen); 34 | if (rcpp_isLongjump_gen) { 35 | Rcpp::internal::resumeJump(rcpp_result_gen); 36 | } 37 | Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error"); 38 | if (rcpp_isError_gen) { 39 | SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); 40 | UNPROTECT(1); 41 | Rf_error(CHAR(rcpp_msgSEXP_gen)); 42 | } 43 | UNPROTECT(1); 44 | return rcpp_result_gen; 45 | } 46 | // add_pr 47 | std::string add_pr(const std::string& line, const std::string& prefix); 48 | static SEXP _fastrtext_add_pr_try(SEXP lineSEXP, SEXP prefixSEXP) { 49 | BEGIN_RCPP 50 | Rcpp::RObject rcpp_result_gen; 51 | Rcpp::traits::input_parameter< const std::string& >::type line(lineSEXP); 52 | Rcpp::traits::input_parameter< const std::string& >::type prefix(prefixSEXP); 53 | rcpp_result_gen = Rcpp::wrap(add_pr(line, prefix)); 54 | return rcpp_result_gen; 55 | END_RCPP_RETURN_ERROR 56 | } 57 | RcppExport SEXP _fastrtext_add_pr(SEXP lineSEXP, SEXP prefixSEXP) { 58 | SEXP rcpp_result_gen; 59 | { 60 | Rcpp::RNGScope rcpp_rngScope_gen; 61 | rcpp_result_gen = PROTECT(_fastrtext_add_pr_try(lineSEXP, prefixSEXP)); 62 | } 63 | Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error"); 64 | if (rcpp_isInterrupt_gen) { 65 | UNPROTECT(1); 66 | Rf_onintr(); 67 | } 68 | bool rcpp_isLongjump_gen = Rcpp::internal::isLongjumpSentinel(rcpp_result_gen); 69 | if (rcpp_isLongjump_gen) { 70 | Rcpp::internal::resumeJump(rcpp_result_gen); 71 | } 72 | Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error"); 73 | if (rcpp_isError_gen) { 74 | SEXP rcpp_msgSEXP_gen = 
Rf_asChar(rcpp_result_gen); 75 | UNPROTECT(1); 76 | Rf_error(CHAR(rcpp_msgSEXP_gen)); 77 | } 78 | UNPROTECT(1); 79 | return rcpp_result_gen; 80 | } 81 | 82 | // validate (ensure exported C++ functions exist before calling them) 83 | static int _fastrtext_RcppExport_validate(const char* sig) { 84 | static std::set signatures; 85 | if (signatures.empty()) { 86 | signatures.insert("CharacterVector(*add_prefix)(const CharacterVector&,CharacterVector)"); 87 | signatures.insert("std::string(*add_pr)(const std::string&,const std::string&)"); 88 | } 89 | return signatures.find(sig) != signatures.end(); 90 | } 91 | 92 | // registerCCallable (register entry points for exported C++ functions) 93 | RcppExport SEXP _fastrtext_RcppExport_registerCCallable() { 94 | R_RegisterCCallable("fastrtext", "_fastrtext_add_prefix", (DL_FUNC)_fastrtext_add_prefix_try); 95 | R_RegisterCCallable("fastrtext", "_fastrtext_add_pr", (DL_FUNC)_fastrtext_add_pr_try); 96 | R_RegisterCCallable("fastrtext", "_fastrtext_RcppExport_validate", (DL_FUNC)_fastrtext_RcppExport_validate); 97 | return R_NilValue; 98 | } 99 | 100 | RcppExport SEXP _rcpp_module_boot_FASTRTEXT_MODULE(); 101 | 102 | static const R_CallMethodDef CallEntries[] = { 103 | {"_fastrtext_add_prefix", (DL_FUNC) &_fastrtext_add_prefix, 2}, 104 | {"_fastrtext_add_pr", (DL_FUNC) &_fastrtext_add_pr, 2}, 105 | {"_rcpp_module_boot_FASTRTEXT_MODULE", (DL_FUNC) &_rcpp_module_boot_FASTRTEXT_MODULE, 0}, 106 | {"_fastrtext_RcppExport_registerCCallable", (DL_FUNC) &_fastrtext_RcppExport_registerCCallable, 0}, 107 | {NULL, NULL, 0} 108 | }; 109 | 110 | RcppExport void R_init_fastrtext(DllInfo *dll) { 111 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 112 | R_useDynamicSymbols(dll, FALSE); 113 | } 114 | -------------------------------------------------------------------------------- /src/fasttext/fasttext.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, 
Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "args.h" 22 | #include "densematrix.h" 23 | #include "dictionary.h" 24 | #include "matrix.h" 25 | #include "meter.h" 26 | #include "model.h" 27 | #include "real.h" 28 | #include "utils.h" 29 | #include "vector.h" 30 | 31 | namespace fasttext { 32 | 33 | class FastText { 34 | protected: 35 | std::shared_ptr args_; 36 | std::shared_ptr dict_; 37 | std::shared_ptr input_; 38 | std::shared_ptr output_; 39 | std::shared_ptr model_; 40 | std::atomic tokenCount_{}; 41 | std::atomic loss_{}; 42 | std::chrono::steady_clock::time_point start_; 43 | bool quant_; 44 | int32_t version; 45 | std::unique_ptr wordVectors_; 46 | std::exception_ptr trainException_; 47 | 48 | void signModel(std::ostream&); 49 | bool checkModel(std::istream&); 50 | void startThreads(); 51 | void addInputVector(Vector&, int32_t) const; 52 | void trainThread(int32_t); 53 | std::vector> getNN( 54 | const DenseMatrix& wordVectors, 55 | const Vector& queryVec, 56 | int32_t k, 57 | const std::set& banSet); 58 | void lazyComputeWordVectors(); 59 | void printInfo(real, real, std::ostream&); 60 | std::shared_ptr getInputMatrixFromFile(const std::string&) const; 61 | std::shared_ptr createRandomMatrix() const; 62 | std::shared_ptr createTrainOutputMatrix() const; 63 | std::vector getTargetCounts() const; 64 | std::shared_ptr createLoss(std::shared_ptr& output); 65 | void supervised( 66 | Model::State& state, 67 | real lr, 68 | const std::vector& line, 69 | const std::vector& labels); 70 | void cbow(Model::State& state, real lr, const std::vector& line); 71 | void skipgram(Model::State& state, real lr, const std::vector& line); 72 | std::vector selectEmbeddings(int32_t cutoff) 
const; 73 | void precomputeWordVectors(DenseMatrix& wordVectors); 74 | bool keepTraining(const int64_t ntokens) const; 75 | 76 | public: 77 | FastText(); 78 | 79 | int32_t getWordId(const std::string& word) const; 80 | 81 | int32_t getSubwordId(const std::string& subword) const; 82 | 83 | void getWordVector(Vector& vec, const std::string& word) const; 84 | 85 | void getSubwordVector(Vector& vec, const std::string& subword) const; 86 | 87 | inline void getInputVector(Vector& vec, int32_t ind) { 88 | vec.zero(); 89 | addInputVector(vec, ind); 90 | } 91 | 92 | const Args getArgs() const; 93 | 94 | std::shared_ptr getDictionary() const; 95 | 96 | std::shared_ptr getInputMatrix() const; 97 | 98 | std::shared_ptr getOutputMatrix() const; 99 | 100 | void saveVectors(const std::string& filename); 101 | 102 | void saveModel(const std::string& filename); 103 | 104 | void saveOutput(const std::string& filename); 105 | 106 | void loadModel(std::istream& in); 107 | 108 | void loadModel(const std::string& filename); 109 | 110 | void getSentenceVector(std::istream& in, Vector& vec); 111 | 112 | void quantize(const Args& qargs); 113 | 114 | std::tuple 115 | test(std::istream& in, int32_t k, real threshold = 0.0); 116 | 117 | void test(std::istream& in, int32_t k, real threshold, Meter& meter) const; 118 | 119 | void predict( 120 | int32_t k, 121 | const std::vector& words, 122 | Predictions& predictions, 123 | real threshold = 0.0) const; 124 | 125 | bool predictLine( 126 | std::istream& in, 127 | std::vector>& predictions, 128 | int32_t k, 129 | real threshold) const; 130 | 131 | std::vector> getNgramVectors( 132 | const std::string& word) const; 133 | 134 | std::vector> getNN( 135 | const std::string& word, 136 | int32_t k); 137 | 138 | std::vector> getAnalogies( 139 | int32_t k, 140 | const std::string& wordA, 141 | const std::string& wordB, 142 | const std::string& wordC); 143 | 144 | void train(const Args& args); 145 | 146 | void abort(); 147 | 148 | int getDimension() const; 
149 | 150 | bool isQuant() const; 151 | 152 | class AbortError : public std::runtime_error { 153 | public: 154 | AbortError() : std::runtime_error("Aborted.") {} 155 | }; 156 | }; 157 | } // namespace fasttext 158 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 122 | 123 |
    YEAR: 2017
    124 | COPYRIGHT HOLDER: Michaël Benesty
    125 | 
    126 | 127 |
    128 | 129 |
    130 | 131 | 132 |
    133 | 136 | 137 |
    138 |

    Site built with pkgdown 1.3.0.

    139 |
    140 |
    141 |
    142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body > .container { 21 | display: flex; 22 | height: 100%; 23 | flex-direction: column; 24 | 25 | padding-top: 60px; 26 | } 27 | 28 | body > .container .row { 29 | flex: 1 0 auto; 30 | } 31 | 32 | footer { 33 | margin-top: 45px; 34 | padding: 35px 0 36px; 35 | border-top: 1px solid #e5e5e5; 36 | color: #666; 37 | display: flex; 38 | flex-shrink: 0; 39 | } 40 | footer p { 41 | margin-bottom: 0; 42 | } 43 | footer div { 44 | flex: 1; 45 | } 46 | footer .pkgdown { 47 | text-align: right; 48 | } 49 | footer p { 50 | margin-bottom: 0; 51 | } 52 | 53 | img.icon { 54 | float: right; 55 | } 56 | 57 | img { 58 | max-width: 100%; 59 | } 60 | 61 | /* Fix bug in bootstrap (only seen in firefox) */ 62 | summary { 63 | display: list-item; 64 | } 65 | 66 | /* Typographic tweaking ---------------------------------*/ 67 | 68 | .contents .page-header { 69 | margin-top: calc(-60px + 1em); 70 | } 71 | 72 | /* Section anchors ---------------------------------*/ 73 | 74 | a.anchor { 75 | margin-left: -30px; 76 | display:inline-block; 77 | width: 30px; 78 | height: 30px; 79 | visibility: hidden; 80 | 81 | background-image: url(./link.svg); 82 | background-repeat: no-repeat; 83 | background-size: 20px 20px; 84 | background-position: center 
center; 85 | } 86 | 87 | .hasAnchor:hover a.anchor { 88 | visibility: visible; 89 | } 90 | 91 | @media (max-width: 767px) { 92 | .hasAnchor:hover a.anchor { 93 | visibility: hidden; 94 | } 95 | } 96 | 97 | 98 | /* Fixes for fixed navbar --------------------------*/ 99 | 100 | .contents h1, .contents h2, .contents h3, .contents h4 { 101 | padding-top: 60px; 102 | margin-top: -40px; 103 | } 104 | 105 | /* Static header placement on mobile devices */ 106 | @media (max-width: 767px) { 107 | .navbar-fixed-top { 108 | position: absolute; 109 | } 110 | .navbar { 111 | padding: 0; 112 | } 113 | } 114 | 115 | 116 | /* Sidebar --------------------------*/ 117 | 118 | #sidebar { 119 | margin-top: 30px; 120 | } 121 | #sidebar h2 { 122 | font-size: 1.5em; 123 | margin-top: 1em; 124 | } 125 | 126 | #sidebar h2:first-child { 127 | margin-top: 0; 128 | } 129 | 130 | #sidebar .list-unstyled li { 131 | margin-bottom: 0.5em; 132 | } 133 | 134 | .orcid { 135 | height: 16px; 136 | vertical-align: middle; 137 | } 138 | 139 | /* Reference index & topics ----------------------------------------------- */ 140 | 141 | .ref-index th {font-weight: normal;} 142 | 143 | .ref-index td {vertical-align: top;} 144 | .ref-index .icon {width: 40px;} 145 | .ref-index .alias {width: 40%;} 146 | .ref-index-icons .alias {width: calc(40% - 40px);} 147 | .ref-index .title {width: 60%;} 148 | 149 | .ref-arguments th {text-align: right; padding-right: 10px;} 150 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 151 | .ref-arguments .name {width: 20%;} 152 | .ref-arguments .desc {width: 80%;} 153 | 154 | /* Nice scrolling for wide elements --------------------------------------- */ 155 | 156 | table { 157 | display: block; 158 | overflow: auto; 159 | } 160 | 161 | /* Syntax highlighting ---------------------------------------------------- */ 162 | 163 | pre { 164 | word-wrap: normal; 165 | word-break: normal; 166 | border: 1px solid #eee; 167 | } 168 | 169 | pre, code { 170 | background-color: 
#f8f8f8; 171 | color: #333; 172 | } 173 | 174 | pre code { 175 | overflow: auto; 176 | word-wrap: normal; 177 | white-space: pre; 178 | } 179 | 180 | pre .img { 181 | margin: 5px 0; 182 | } 183 | 184 | pre .img img { 185 | background-color: #fff; 186 | display: block; 187 | height: auto; 188 | } 189 | 190 | code a, pre a { 191 | color: #375f84; 192 | } 193 | 194 | a.sourceLine:hover { 195 | text-decoration: none; 196 | } 197 | 198 | .fl {color: #1514b5;} 199 | .fu {color: #000000;} /* function */ 200 | .ch,.st {color: #036a07;} /* string */ 201 | .kw {color: #264D66;} /* keyword */ 202 | .co {color: #888888;} /* comment */ 203 | 204 | .message { color: black; font-weight: bolder;} 205 | .error { color: orange; font-weight: bolder;} 206 | .warning { color: #6A0366; font-weight: bolder;} 207 | 208 | /* Clipboard --------------------------*/ 209 | 210 | .hasCopyButton { 211 | position: relative; 212 | } 213 | 214 | .btn-copy-ex { 215 | position: absolute; 216 | right: 0; 217 | top: 0; 218 | visibility: hidden; 219 | } 220 | 221 | .hasCopyButton:hover button.btn-copy-ex { 222 | visibility: visible; 223 | } 224 | 225 | /* mark.js ----------------------------*/ 226 | 227 | mark { 228 | background-color: rgba(255, 255, 51, 0.5); 229 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 230 | padding: 1px; 231 | } 232 | 233 | /* vertical spacing after htmlwidgets */ 234 | .html-widget { 235 | margin-bottom: 10px; 236 | } 237 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 122 | 123 |
      124 |
    • 125 |

      Michaël Benesty. Author, maintainer, copyright holder. 126 |

      127 |
    • 128 |
    • 129 |

      Facebook, Inc. Copyright holder. 130 |

      131 |
    • 132 |
    133 | 134 |
    135 | 136 |
    137 | 138 | 139 |
    140 | 143 | 144 |
    145 |

    Site built with pkgdown 1.3.0.

    146 |
    147 |
    148 |
    149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Changelog • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 123 | 124 |
    125 | 126 | 133 | 134 |
    135 | 136 |
    137 | 140 | 141 |
    142 |

    Site built with pkgdown 1.3.0.

    143 |
    144 |
    145 |
    146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 122 | 123 |
    124 |

    All vignettes

    125 |

    126 | 127 | 132 |
    133 |
    134 |
    135 | 136 |
    137 | 140 | 141 |
    142 |

    Site built with pkgdown 1.3.0.

    143 |
    144 |
    145 |
    146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /tests/testthat/test-supervised.R: -------------------------------------------------------------------------------- 1 | context("Supervised training") 2 | 3 | data("train_sentences") 4 | data("test_sentences") 5 | 6 | test_labels <- paste0("__label__", test_sentences[, "class.text"]) 7 | test_labels_without_prefix <- test_sentences[, "class.text"] 8 | test_texts <- tolower(test_sentences[, "text"]) 9 | test_sentences_with_labels <- paste(test_labels, test_texts) 10 | 11 | model_test_path <- system.file("extdata", 12 | "model_classification_test.bin", 13 | package = "fastrtext") 14 | 15 | test_that("Training of a classification model", { 16 | # prepare data 17 | tmp_file_model <- tempfile() 18 | tmp_file_model_quantize <- tempfile() 19 | 20 | train_labels <- paste0("__label__", train_sentences[, "class.text"]) 21 | train_texts <- tolower(train_sentences[, "text"]) 22 | train_to_write <- paste(train_labels, train_texts) 23 | train_tmp_file_txt <- tempfile() 24 | writeLines(text = train_to_write, con = train_tmp_file_txt) 25 | 26 | # learn model 27 | execute(commands = 28 | c("supervised", 29 | "-input", train_tmp_file_txt, 30 | "-output", tmp_file_model, 31 | "-dim", 10, 32 | "-lr", 1, 33 | "-epoch", 10, 34 | "-bucket", 1e4, 35 | "-verbose", 0)) 36 | 37 | # Check learned file exists 38 | expect_true(file.exists(paste0(tmp_file_model, ".bin"))) 39 | 40 | learned_model <- load_model(tmp_file_model) 41 | learned_model_predictions <- predict(learned_model, 42 | sentences = test_sentences_with_labels) 43 | 44 | # Compare with embedded model 45 | embedded_model <- load_model(model_test_path) 46 | embedded_model_predictions <- predict(embedded_model, 47 | sentences = test_sentences_with_labels) 48 | expect_gt(mean(names(unlist(learned_model_predictions)) == 49 | names(unlist(embedded_model_predictions))), 0.75) 50 | 51 | build_supervised(documents = 
train_texts, 52 | targets = train_sentences[, "class.text"], 53 | model_path = tmp_file_model, 54 | dim = 10, 55 | lr = 1, 56 | epoch = 10, 57 | bucket = 1e4, 58 | verbose = 0) 59 | 60 | expect_true(file.exists(paste0(tmp_file_model, ".bin"))) 61 | 62 | learned_model <- load_model(tmp_file_model) 63 | learned_model_predictions_bis <- predict(learned_model, 64 | sentences = test_sentences_with_labels) 65 | 66 | expect_gt(object = mean(names(unlist(learned_model_predictions)) == names(unlist(learned_model_predictions_bis))), 67 | expected = 0.75) 68 | 69 | # check with simplify = TRUE 70 | embedded_model_predictions_bis <- predict(embedded_model, 71 | sentences = test_sentences_with_labels, 72 | simplify = TRUE) 73 | expect_true(is.numeric(embedded_model_predictions_bis)) 74 | expect_gt(mean(names(unlist(learned_model_predictions)) == 75 | names(embedded_model_predictions_bis)), 0.75) 76 | 77 | # Compare with quantize model 78 | # execute(commands = c("quantize", 79 | # "-output", tmp_file_model, 80 | # "-input", train_tmp_file_txt, 81 | # "-qnorm", 82 | # "-retrain", 83 | # "-epoch", 10, 84 | # "-cutoff", 100000)) 85 | # 86 | # expect_true(file.exists(paste0(tmp_file_model, ".ftz"))) 87 | # quantized_model <- load_model(paste0(tmp_file_model, ".ftz")) 88 | # quantized_model_predictions <- predict(quantized_model, 89 | # sentences = test_sentences_with_labels) 90 | # expect_gt(mean(names(unlist(embedded_model_predictions_bis)) == 91 | # names(unlist(quantized_model_predictions))), 0.75) 92 | }) 93 | 94 | test_that("Test predictions", { 95 | model <- load_model(model_test_path) 96 | predictions <- predict(model, sentences = test_sentences_with_labels) 97 | 98 | # test measure (for 1 class, hamming == accuracy) 99 | expect_equal(get_hamming_loss(as.list(test_labels_without_prefix), predictions), 100 | mean(sapply(predictions, names) == test_labels_without_prefix)) 101 | 102 | expect_gt(get_hamming_loss(as.list(test_labels_without_prefix), predictions), 0.75) 103 | 104 
| predictions <- predict(model, sentences = test_sentences_with_labels) 105 | expect_length(predictions, 600) 106 | expect_equal(unique(lengths(predictions)), 1) 107 | expect_equal(unique(lengths(predict(model, 108 | sentences = test_sentences_with_labels, 109 | k = 2))), 2) 110 | expect_gt(object = mean(sapply(predictions, names) == test_labels_without_prefix), 111 | expected = 0.75) 112 | }) 113 | 114 | test_that("Test parameter extraction", { 115 | model <- load_model(model_test_path) 116 | parameters <- get_parameters(model) 117 | expect_equal(parameters$model_name, "supervised") 118 | }) 119 | 120 | test_that("Test label extraction", { 121 | model <- load_model(model_test_path) 122 | labels_from_model <- get_labels(model) 123 | expect_length(labels_from_model, 15) 124 | }) 125 | 126 | test_that("Test formating documents", { 127 | tags <- list(c(1, 5), 0) 128 | documents <- c("this is a text", "this is another document") 129 | results <- add_tags(documents = documents, tags = tags) 130 | expect_length(results, 2) 131 | expect_equal(results[1], "__label__1 __label__5 this is a text") 132 | 133 | results <- add_tags(documents = documents, tags = c(0, 1)) 134 | expect_length(results, 2) 135 | expect_equal(results[1], "__label__0 this is a text") 136 | }) 137 | 138 | gc() 139 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | ![fastrtext](https://github.com/pommedeterresautee/fastrtext/raw/master/tools/logo.png) 2 | ========= 3 | 4 | [![Travis-CI Build Status](https://travis-ci.org/pommedeterresautee/fastrtext.svg?branch=master)](https://travis-ci.org/pommedeterresautee/fastrtext) 5 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/pommedeterresautee/fastrtext?branch=master&svg=true)](https://ci.appveyor.com/project/pommedeterresautee/fastrtext) 6 | 
[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/fastrtext)](https://cran.r-project.org/package=fastrtext) 7 | [![CRAN_Download](http://cranlogs.r-pkg.org/badges/fastrtext)](http://cran.rstudio.com/web/packages/fastrtext/index.html) 8 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 9 | [![codecov](https://codecov.io/gh/pommedeterresautee/fastrtext/branch/master/graph/badge.svg)](https://codecov.io/gh/pommedeterresautee/fastrtext) 10 | [![Follow](https://img.shields.io/twitter/follow/pommedeterre33.svg?style=social)](https://twitter.com/intent/follow?screen_name=pommedeterre33) 11 | 12 | [R Documentation](https://pommedeterresautee.github.io/fastrtext/) | [Release Notes](https://github.com/pommedeterresautee/fastrtext/blob/master/NEWS.md) | [FAQ](https://fasttext.cc/docs/en/faqs.html) | [Multilingual pretrained models](https://fasttext.cc/docs/en/crawl-vectors.html) 13 | 14 | R wrapper for [fastText](https://github.com/facebookresearch/fastText) C++ code from Facebook. 15 | 16 | FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. 17 | 18 | 19 | License 20 | ------- 21 | 22 | © Contributors, 2018. Licensed under a MIT license. 23 | 24 | Installation 25 | ------------ 26 | 27 | You can install the `fastrtext` package from Cran or Github as follows: 28 | 29 | ```R 30 | # From Cran 31 | install.packages("fastrtext") 32 | 33 | # From Github 34 | # install.packages("devtools") 35 | devtools::install_github("pommedeterresautee/fastrtext") 36 | ``` 37 | 38 | Documentation 39 | ------------- 40 | 41 | All the updated documentation can be reached at this [address](https://pommedeterresautee.github.io/fastrtext/). 
42 | 43 | API 44 | --- 45 | 46 | API documentation can be reached at this [address](https://pommedeterresautee.github.io/fastrtext/reference/index.html). 47 | 48 | In particular, command line options are listed [there](https://pommedeterresautee.github.io/fastrtext/articles/list_command.html). 49 | 50 | ### Supervised learning (text classification) 51 | 52 | Data for a multi-class task are embedded in this package. 53 | Follow this [link](https://pommedeterresautee.github.io/fastrtext/articles/supervised_learning.html) to learn a model and then measure the accuracy in 5 minutes. 54 | 55 | 56 | ### Unsupervised learning (word representation) 57 | 58 | Data for a word representation learning task are embedded in this package. 59 | Following this [link](https://pommedeterresautee.github.io/fastrtext/articles/unsupervised_learning.html) will route you to a 5-minute tutorial to learn vector representations of words (aka word embeddings): 60 | 61 | Alternatives 62 | ------------ 63 | 64 | Why not use the command line client? 65 | 66 | * You can call the client from R using `system("fasttext ...")` ; 67 | * To get predictions, you would need to write a file, make predictions from the command line, then read the results ; 68 | * `fastrtext` makes your life easier by making all these operations in memory ; 69 | * It takes less time, and uses fewer commands ; 70 | * Easy to install from R directly. 71 | 72 | Why not use [fastTextR](https://github.com/mlampros/fastTextR/) ? 73 | 74 | * `fastrtext` implements both supervised and unsupervised parts of `fastText` (`fastTextR` implements only the unsupervised part) ; 75 | * with `fastrtext`, predictions can be done in memory (`fastTextR` requires writing the sentences to disk and reading the predictions back afterwards) ; 76 | * fastText original source code embedded in fastTextR is not up to date (it misses several new features and bug fixes added since January 2017). 
77 | 78 | References 79 | ---------- 80 | 81 | Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations or [2](#bag-of-tricks-for-efficient-text-classification) if using for text classification. 82 | 83 | ### Enriching Word Vectors with Subword Information 84 | 85 | [1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606) 86 | 87 | ``` 88 | @article{bojanowski2016enriching, 89 | title={Enriching Word Vectors with Subword Information}, 90 | author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, 91 | journal={arXiv preprint arXiv:1607.04606}, 92 | year={2016} 93 | } 94 | ``` 95 | 96 | ### Bag of Tricks for Efficient Text Classification 97 | 98 | [2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759) 99 | 100 | ``` 101 | @article{joulin2016bag, 102 | title={Bag of Tricks for Efficient Text Classification}, 103 | author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, 104 | journal={arXiv preprint arXiv:1607.01759}, 105 | year={2016} 106 | } 107 | ``` 108 | 109 | ### FastText.zip: Compressing text classification models 110 | 111 | [3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651) 112 | 113 | ``` 114 | @article{joulin2016fasttext, 115 | title={FastText.zip: Compressing text classification models}, 116 | author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas}, 117 | journal={arXiv preprint arXiv:1612.03651}, 118 | year={2016} 119 | } 120 | ``` 121 | 122 | (\* These authors contributed equally.) 
123 | -------------------------------------------------------------------------------- /docs/reference/print_help.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Print help — print_help • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Print command information, mainly to be used with the execute() function.

    131 | 132 |
    133 | 134 |
    print_help()
    135 | 136 | 137 |

    Examples

    138 |
    # NOT RUN {
    139 | print_help()
    140 | # }
    141 |
    142 |
    143 | 151 |
    152 | 153 |
    154 | 157 | 158 |
    159 |

    Site built with pkgdown 1.3.0.

    160 |
    161 |
    162 |
    163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /docs/reference/stop_words_sentences.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Stop words list — stop_words_sentences • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    List of words that can be safely removed from sentences.

    131 | 132 |
    133 | 134 |
    stop_words_sentences
    135 | 136 |

    Format

    137 | 138 |

    Character vector of stop words

    139 | 140 |

    Source

    141 | 142 |

    https://archive.ics.uci.edu/ml/datasets.html?format=&task=&att=&area=&numAtt=&numIns=&type=text&sort=nameUp&view=table

    143 | 144 | 145 |
    146 | 156 |
    157 | 158 |
    159 | 162 | 163 |
    164 |

    Site built with pkgdown 1.3.0.

    165 |
    166 |
    167 |
    168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /docs/reference/Rcpp_fastrtext-class.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Rcpp_fastrtext class — Rcpp_fastrtext-class • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Models are S4 objects with several slots (methods) which can be called this way: model$slot_name()

    131 | 132 |
    133 | 134 | 135 |

    Slots

    136 | 137 | 138 |
    139 |
    load

    Load a model

    140 |
    predict

    Make a prediction

    141 |
    execute

    Execute commands

    142 |
    get_vectors

    Get vectors related to provided words

    143 |
    get_parameters

    Get parameters used to train the model

    144 |
    get_dictionary

    List all words learned

    145 |
    get_labels

    List all labels learned

    146 |
    147 | 148 | 149 |
    150 | 158 |
    159 | 160 |
    161 | 164 | 165 |
    166 |

    Site built with pkgdown 1.3.0.

    167 |
    168 |
    169 |
    170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /src/fasttext/productquantizer.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "productquantizer.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | real distL2(const real* x, const real* y, int32_t d) { 20 | real dist = 0; 21 | for (auto i = 0; i < d; i++) { 22 | auto tmp = x[i] - y[i]; 23 | dist += tmp * tmp; 24 | } 25 | return dist; 26 | } 27 | 28 | ProductQuantizer::ProductQuantizer(int32_t dim, int32_t dsub) 29 | : dim_(dim), 30 | nsubq_(dim / dsub), 31 | dsub_(dsub), 32 | centroids_(dim * ksub_), 33 | rng(seed_) { 34 | lastdsub_ = dim_ % dsub; 35 | if (lastdsub_ == 0) { 36 | lastdsub_ = dsub_; 37 | } else { 38 | nsubq_++; 39 | } 40 | } 41 | 42 | const real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) const { 43 | if (m == nsubq_ - 1) { 44 | return ¢roids_[m * ksub_ * dsub_ + i * lastdsub_]; 45 | } 46 | return ¢roids_[(m * ksub_ + i) * dsub_]; 47 | } 48 | 49 | real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) { 50 | if (m == nsubq_ - 1) { 51 | return ¢roids_[m * ksub_ * dsub_ + i * lastdsub_]; 52 | } 53 | return ¢roids_[(m * ksub_ + i) * dsub_]; 54 | } 55 | 56 | real ProductQuantizer::assign_centroid( 57 | const real* x, 58 | const real* c0, 59 | uint8_t* code, 60 | int32_t d) const { 61 | const real* c = c0; 62 | real dis = distL2(x, c, d); 63 | code[0] = 0; 64 | for (auto j = 1; j < ksub_; j++) { 65 | c += d; 66 | real disij = distL2(x, c, d); 67 | if (disij < dis) { 68 | code[0] = (uint8_t)j; 69 | dis = disij; 70 | } 71 | } 72 | return dis; 73 | } 74 | 75 | void 
ProductQuantizer::Estep( 76 | const real* x, 77 | const real* centroids, 78 | uint8_t* codes, 79 | int32_t d, 80 | int32_t n) const { 81 | for (auto i = 0; i < n; i++) { 82 | assign_centroid(x + i * d, centroids, codes + i, d); 83 | } 84 | } 85 | 86 | void ProductQuantizer::MStep( 87 | const real* x0, 88 | real* centroids, 89 | const uint8_t* codes, 90 | int32_t d, 91 | int32_t n) { 92 | std::vector nelts(ksub_, 0); 93 | memset(centroids, 0, sizeof(real) * d * ksub_); 94 | const real* x = x0; 95 | for (auto i = 0; i < n; i++) { 96 | auto k = codes[i]; 97 | real* c = centroids + k * d; 98 | for (auto j = 0; j < d; j++) { 99 | c[j] += x[j]; 100 | } 101 | nelts[k]++; 102 | x += d; 103 | } 104 | 105 | real* c = centroids; 106 | for (auto k = 0; k < ksub_; k++) { 107 | real z = (real)nelts[k]; 108 | if (z != 0) { 109 | for (auto j = 0; j < d; j++) { 110 | c[j] /= z; 111 | } 112 | } 113 | c += d; 114 | } 115 | 116 | std::uniform_real_distribution<> runiform(0, 1); 117 | for (auto k = 0; k < ksub_; k++) { 118 | if (nelts[k] == 0) { 119 | int32_t m = 0; 120 | while (runiform(rng) * (n - ksub_) >= nelts[m] - 1) { 121 | m = (m + 1) % ksub_; 122 | } 123 | memcpy(centroids + k * d, centroids + m * d, sizeof(real) * d); 124 | for (auto j = 0; j < d; j++) { 125 | int32_t sign = (j % 2) * 2 - 1; 126 | centroids[k * d + j] += sign * eps_; 127 | centroids[m * d + j] -= sign * eps_; 128 | } 129 | nelts[k] = nelts[m] / 2; 130 | nelts[m] -= nelts[k]; 131 | } 132 | } 133 | } 134 | 135 | void ProductQuantizer::kmeans(const real* x, real* c, int32_t n, int32_t d) { 136 | std::vector perm(n, 0); 137 | std::iota(perm.begin(), perm.end(), 0); 138 | std::shuffle(perm.begin(), perm.end(), rng); 139 | for (auto i = 0; i < ksub_; i++) { 140 | memcpy(&c[i * d], x + perm[i] * d, d * sizeof(real)); 141 | } 142 | auto codes = std::vector(n); 143 | for (auto i = 0; i < niter_; i++) { 144 | Estep(x, c, codes.data(), d, n); 145 | MStep(x, c, codes.data(), d, n); 146 | } 147 | } 148 | 149 | void 
ProductQuantizer::train(int32_t n, const real* x) { 150 | if (n < ksub_) { 151 | throw std::invalid_argument( 152 | "Matrix too small for quantization, must have at least " + 153 | std::to_string(ksub_) + " rows"); 154 | } 155 | std::vector perm(n, 0); 156 | std::iota(perm.begin(), perm.end(), 0); 157 | auto d = dsub_; 158 | auto np = std::min(n, max_points_); 159 | auto xslice = std::vector(np * dsub_); 160 | for (auto m = 0; m < nsubq_; m++) { 161 | if (m == nsubq_ - 1) { 162 | d = lastdsub_; 163 | } 164 | if (np != n) { 165 | std::shuffle(perm.begin(), perm.end(), rng); 166 | } 167 | for (auto j = 0; j < np; j++) { 168 | memcpy( 169 | xslice.data() + j * d, 170 | x + perm[j] * dim_ + m * dsub_, 171 | d * sizeof(real)); 172 | } 173 | kmeans(xslice.data(), get_centroids(m, 0), np, d); 174 | } 175 | } 176 | 177 | real ProductQuantizer::mulcode( 178 | const Vector& x, 179 | const uint8_t* codes, 180 | int32_t t, 181 | real alpha) const { 182 | real res = 0.0; 183 | auto d = dsub_; 184 | const uint8_t* code = codes + nsubq_ * t; 185 | for (auto m = 0; m < nsubq_; m++) { 186 | const real* c = get_centroids(m, code[m]); 187 | if (m == nsubq_ - 1) { 188 | d = lastdsub_; 189 | } 190 | for (auto n = 0; n < d; n++) { 191 | res += x[m * dsub_ + n] * c[n]; 192 | } 193 | } 194 | return res * alpha; 195 | } 196 | 197 | void ProductQuantizer::addcode( 198 | Vector& x, 199 | const uint8_t* codes, 200 | int32_t t, 201 | real alpha) const { 202 | auto d = dsub_; 203 | const uint8_t* code = codes + nsubq_ * t; 204 | for (auto m = 0; m < nsubq_; m++) { 205 | const real* c = get_centroids(m, code[m]); 206 | if (m == nsubq_ - 1) { 207 | d = lastdsub_; 208 | } 209 | for (auto n = 0; n < d; n++) { 210 | x[m * dsub_ + n] += alpha * c[n]; 211 | } 212 | } 213 | } 214 | 215 | void ProductQuantizer::compute_code(const real* x, uint8_t* code) const { 216 | auto d = dsub_; 217 | for (auto m = 0; m < nsubq_; m++) { 218 | if (m == nsubq_ - 1) { 219 | d = lastdsub_; 220 | } 221 | 
assign_centroid(x + m * dsub_, get_centroids(m, 0), code + m, d); 222 | } 223 | } 224 | 225 | void ProductQuantizer::compute_codes(const real* x, uint8_t* codes, int32_t n) 226 | const { 227 | for (auto i = 0; i < n; i++) { 228 | compute_code(x + i * dim_, codes + i * nsubq_); 229 | } 230 | } 231 | 232 | void ProductQuantizer::save(std::ostream& out) const { 233 | out.write((char*)&dim_, sizeof(dim_)); 234 | out.write((char*)&nsubq_, sizeof(nsubq_)); 235 | out.write((char*)&dsub_, sizeof(dsub_)); 236 | out.write((char*)&lastdsub_, sizeof(lastdsub_)); 237 | out.write((char*)centroids_.data(), centroids_.size() * sizeof(real)); 238 | } 239 | 240 | void ProductQuantizer::load(std::istream& in) { 241 | in.read((char*)&dim_, sizeof(dim_)); 242 | in.read((char*)&nsubq_, sizeof(nsubq_)); 243 | in.read((char*)&dsub_, sizeof(dsub_)); 244 | in.read((char*)&lastdsub_, sizeof(lastdsub_)); 245 | centroids_.resize(dim_ * ksub_); 246 | for (auto i = 0; i < centroids_.size(); i++) { 247 | in.read((char*)¢roids_[i], sizeof(real)); 248 | } 249 | } 250 | 251 | } // namespace fasttext 252 | -------------------------------------------------------------------------------- /docs/reference/load_model.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Load an existing fastText trained model — load_model • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Load and return a pointer to an existing model which will be used in other functions of this package.

    131 | 132 |
    133 | 134 |
    load_model(path)
    135 | 136 |

    Arguments

    137 | 138 | 139 | 140 | 141 | 142 | 143 |
    path

    path to the existing model

    144 | 145 | 146 |

    Examples

    147 |
    148 | library(fastrtext) 149 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 150 | model <- load_model(model_test_path)
    151 |
    152 | 161 |
    162 | 163 |
    164 | 167 | 168 |
    169 |

    Site built with pkgdown 1.3.0.

    170 |
    171 |
    172 |
    173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /docs/reference/add_prefix.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Add a prefix to each word — add_prefix • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 50 | 51 | 52 | 53 | 54 |
    55 |
    56 | 117 | 118 | 119 |
    120 | 121 |
    122 |
    123 | 128 | 129 |
    130 | 131 |

    Add a custom prefix to each word of a line to create different spaces. 132 | Code in C++ (efficient).

    133 | 134 |
    135 | 136 |
    add_prefix(texts, prefix)
    137 | 138 |

    Arguments

    139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 |
    texts

    a character containing the original text

    prefix

    unit character containing the prefix to add (length == 1) or character with the same length as texts

    150 | 151 |

    Value

    152 | 153 |

    character with prefixed words.

    154 | 155 | 156 |

    Examples

    157 |
    add_prefix(c("this is a test", "this is another test"), "#")
    #> [1] "#this #is #a #test" "#this #is #another #test"
    158 |
    159 | 170 |
    171 | 172 |
    173 | 176 | 177 |
    178 |

    Site built with pkgdown 1.3.0.

    179 |
    180 |
    181 |
    182 | 183 | 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /docs/reference/get_word_distance.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Distance between two words — get_word_distance • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Distance is equal to 1 - cosine similarity between the two word vectors

    131 | 132 |
    133 | 134 |
    get_word_distance(model, w1, w2)
    135 | 136 |

    Arguments

    137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 |
    model

    trained fastText model. NULL if training a new model.

    w1

    first word to compare

    w2

    second word to compare

    152 | 153 |

    Value

    154 | 155 |

    a scalar with the distance

    156 | 157 | 158 |

    Examples

    159 |
    160 | library(fastrtext) 161 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 162 | model <- load_model(model_test_path) 163 | get_word_distance(model, "time", "timing")
    #> [,1] 164 | #> [1,] 0.5868116
    165 |
    166 |
    167 | 178 |
    179 | 180 | 189 |
    190 | 191 | 192 | 193 | 194 | 195 | 196 | --------------------------------------------------------------------------------