├── LICENSE ├── tools ├── logo.png ├── logo.xcf ├── logo_black.xcf ├── logo_white.xcf └── fasttext-logo-color-web.png ├── tests ├── testthat.R └── testthat │ ├── test-prefix.R │ ├── test-unsupervised.R │ └── test-supervised.R ├── data ├── test_sentences.rda ├── train_sentences.rda └── stop_words_sentences.rda ├── inst ├── extdata │ ├── model_classification_test.bin │ └── model_unsupervised_test.bin └── include │ ├── fastrtext.h │ └── fastrtext_RcppExports.h ├── .gitignore ├── CRAN-RELEASE ├── cleanup ├── src ├── main.h ├── fasttext │ ├── real.h │ ├── matrix.cc │ ├── matrix.h │ ├── utils.cc │ ├── utils.h │ ├── vector.h │ ├── quantmatrix.h │ ├── productquantizer.h │ ├── meter.h │ ├── model.h │ ├── meter.cc │ ├── vector.cc │ ├── args.h │ ├── densematrix.h │ ├── model.cc │ ├── autotune.h │ ├── dictionary.h │ ├── quantmatrix.cc │ ├── loss.h │ ├── densematrix.cc │ ├── fasttext.h │ └── productquantizer.cc ├── r_compliance.cc ├── r_compliance.h ├── Makevars ├── add_prefix.cpp └── RcppExports.cpp ├── docs ├── pkgdown.yml ├── link.svg ├── docsearch.js ├── pkgdown.js ├── LICENSE-text.html ├── pkgdown.css ├── authors.html ├── news │ └── index.html ├── articles │ └── index.html └── reference │ ├── print_help.html │ ├── stop_words_sentences.html │ ├── Rcpp_fastrtext-class.html │ ├── load_model.html │ ├── add_prefix.html │ └── get_word_distance.html ├── .Rbuildignore ├── .travis.yml ├── man ├── print_help.Rd ├── stop_words_sentences.Rd ├── load_model.Rd ├── get_parameters.Rd ├── get_labels.Rd ├── get_dictionary.Rd ├── add_prefix.Rd ├── Rcpp_fastrtext-class.Rd ├── get_sentence_representation.Rd ├── get_word_distance.Rd ├── get_word_ids.Rd ├── get_tokenized_text.Rd ├── get_word_vectors.Rd ├── get_hamming_loss.Rd ├── get_nn.Rd ├── add_tags.Rd ├── fastrtext.Rd ├── predict.Rcpp_fastrtext.Rd ├── execute.Rd ├── test_sentences.Rd ├── train_sentences.Rd ├── build_vectors.Rd └── build_supervised.Rd ├── R ├── zzz.R ├── RcppExports.R └── data.R ├── NAMESPACE ├── _pkgdown.yml ├── appveyor.yml 
├── vignettes ├── unsupervised_learning.Rmd ├── supervised_learning.Rmd └── list_commands.Rmd ├── data-raw └── create_models.R ├── DESCRIPTION ├── README.md ├── NEWS.md ├── cran-comments.md └── index.md /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017 2 | COPYRIGHT HOLDER: Michaël Benesty -------------------------------------------------------------------------------- /tools/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo.png -------------------------------------------------------------------------------- /tools/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo.xcf -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(fastrtext) 3 | 4 | test_check("fastrtext") 5 | -------------------------------------------------------------------------------- /tools/logo_black.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo_black.xcf -------------------------------------------------------------------------------- /tools/logo_white.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/logo_white.xcf -------------------------------------------------------------------------------- /data/test_sentences.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/data/test_sentences.rda 
-------------------------------------------------------------------------------- /data/train_sentences.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/data/train_sentences.rda -------------------------------------------------------------------------------- /data/stop_words_sentences.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/data/stop_words_sentences.rda -------------------------------------------------------------------------------- /tools/fasttext-logo-color-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/tools/fasttext-logo-color-web.png -------------------------------------------------------------------------------- /inst/extdata/model_classification_test.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/inst/extdata/model_classification_test.bin -------------------------------------------------------------------------------- /inst/extdata/model_unsupervised_test.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pommedeterresautee/fastrtext/HEAD/inst/extdata/model_unsupervised_test.bin -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rproj 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | build/* 6 | *.o 7 | *.so 8 | *.dll 9 | data-raw/*.bin 10 | data-raw/*.vec 11 | -------------------------------------------------------------------------------- /CRAN-RELEASE: -------------------------------------------------------------------------------- 1 | 
This package was submitted to CRAN on 2019-10-27. 2 | Once it is accepted, delete this file and tag the release (commit 7c1c7cdf4a). 3 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -Rf src/*.o src/fasttext/*.o src/*.so src/fasttext/*.so src/*.dll src/fasttext/*.dll src/*.dylib src/fasttext/*.dylib src/symbols.rds 4 | -------------------------------------------------------------------------------- /src/main.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "fasttext/fasttext.h" 6 | #include "fasttext/args.h" 7 | 8 | int main(int argc, char** argv); 9 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.3.1 2 | pkgdown: 1.3.0 3 | pkgdown_sha: ~ 4 | articles: 5 | list_commands: list_commands.html 6 | supervised_learning: supervised_learning.html 7 | unsupervised_learning: unsupervised_learning.html 8 | 9 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^\.travis\.yml$ 5 | ^appveyor\.yml$ 6 | ^/tools/*\.png$ 7 | ^docs$ 8 | ^README\.md$ 9 | ^cran-comments\.md$ 10 | ^.*.\.o$ 11 | ^_pkgdown\.yml$ 12 | ^data-raw$ 13 | index.md 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | r_github_packages: 8 | - r-lib/covr 9 | 10 | after_success: 11 
| - travis_wait 180 Rscript -e 'covr::codecov()' 12 | -------------------------------------------------------------------------------- /inst/include/fastrtext.h: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #ifndef RCPP_fastrtext_H_GEN_ 5 | #define RCPP_fastrtext_H_GEN_ 6 | 7 | #include "fastrtext_RcppExports.h" 8 | 9 | #endif // RCPP_fastrtext_H_GEN_ 10 | -------------------------------------------------------------------------------- /src/fasttext/real.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | namespace fasttext { 12 | 13 | typedef float real; 14 | 15 | } 16 | -------------------------------------------------------------------------------- /man/print_help.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{print_help} 4 | \alias{print_help} 5 | \title{Print help} 6 | \usage{ 7 | print_help() 8 | } 9 | \description{ 10 | Print command information, mainly to use with \code{\link[=execute]{execute()}} \code{function}. 
11 | } 12 | \examples{ 13 | \dontrun{ 14 | print_help() 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /man/stop_words_sentences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{stop_words_sentences} 5 | \alias{stop_words_sentences} 6 | \title{Stop words list} 7 | \format{Character vector of stop words} 8 | \source{ 9 | \url{https://archive.ics.uci.edu/ml/index.php} 10 | } 11 | \usage{ 12 | stop_words_sentences 13 | } 14 | \description{ 15 | List of words that can be safely removed from sentences. 16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /tests/testthat/test-prefix.R: -------------------------------------------------------------------------------- 1 | context("test word prefix") 2 | 3 | test_that("test unique prefix", { 4 | expect_equal(object = add_prefix(c("this is a test", "this is another test"), "#"), 5 | expected = c("#this #is #a #test", 6 | "#this #is #another #test")) 7 | }) 8 | 9 | test_that("test multiple prefixes", { 10 | expect_equal(object = add_prefix(c("this is a test", "this is another test"), c("#", "*")), 11 | expected = c("#this #is #a #test", 12 | "*this *is *another *test")) 13 | }) 14 | -------------------------------------------------------------------------------- /src/fasttext/matrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "matrix.h" 10 | 11 | namespace fasttext { 12 | 13 | Matrix::Matrix() : m_(0), n_(0) {} 14 | 15 | Matrix::Matrix(int64_t m, int64_t n) : m_(m), n_(n) {} 16 | 17 | int64_t Matrix::size(int64_t dim) const { 18 | assert(dim == 0 || dim == 1); 19 | if (dim == 0) { 20 | return m_; 21 | } 22 | return n_; 23 | } 24 | 25 | } // namespace fasttext 26 | -------------------------------------------------------------------------------- /man/load_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{load_model} 4 | \alias{load_model} 5 | \title{Load an existing fastText trained model} 6 | \usage{ 7 | load_model(path) 8 | } 9 | \arguments{ 10 | \item{path}{path to the existing model} 11 | } 12 | \description{ 13 | Load and return a pointer to an existing model which will be used in other functions of this package. 14 | } 15 | \examples{ 16 | 17 | library(fastrtext) 18 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 19 | model <- load_model(model_test_path) 20 | } 21 | -------------------------------------------------------------------------------- /src/r_compliance.cc: -------------------------------------------------------------------------------- 1 | // Content of this file is added to each source of fastText to change some behaviours 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "real.h" 7 | 8 | void exit_fasttext(int status_code) { 9 | if (status_code != EXIT_SUCCESS) { 10 | Rcpp::stop("Failure in fastrtext. 
Exit code: " + std::to_string(status_code)); 11 | } 12 | } 13 | 14 | // catch interrupt from the user 15 | // void interrupt_or_print(double maxDuration) { 16 | // Rcpp::checkUserInterrupt(); 17 | // printInfo(maxDuration); 18 | // } 19 | 20 | namespace std { 21 | std::ostream Rcout(Rcpp::Rcout.rdbuf()); 22 | } 23 | -------------------------------------------------------------------------------- /man/get_parameters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_parameters} 4 | \alias{get_parameters} 5 | \title{Export hyper parameters} 6 | \usage{ 7 | get_parameters(model) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | } 12 | \value{ 13 | \link{list} containing each parameter 14 | } 15 | \description{ 16 | Retrieve hyper parameters used to train the model 17 | } 18 | \examples{ 19 | 20 | library(fastrtext) 21 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 22 | model <- load_model(model_test_path) 23 | print(head(get_parameters(model), 5)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/get_labels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_labels} 4 | \alias{get_labels} 5 | \title{Get list of labels (supervised model)} 6 | \usage{ 7 | get_labels(model) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | } 12 | \value{ 13 | \link{character} containing each label 14 | } 15 | \description{ 16 | Get a \link{character} containing each label seen during training. 
17 | } 18 | \examples{ 19 | 20 | library(fastrtext) 21 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 22 | model <- load_model(model_test_path) 23 | print(head(get_labels(model), 5)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/get_dictionary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_dictionary} 4 | \alias{get_dictionary} 5 | \title{Get list of known words} 6 | \usage{ 7 | get_dictionary(model) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | } 12 | \value{ 13 | \link{character} containing each word 14 | } 15 | \description{ 16 | Get a \link{character} containing each word seen during training. 17 | } 18 | \examples{ 19 | 20 | library(fastrtext) 21 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 22 | model <- load_model(model_test_path) 23 | print(head(get_dictionary(model), 5)) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/add_prefix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{add_prefix} 4 | \alias{add_prefix} 5 | \title{Add a prefix to each word} 6 | \usage{ 7 | add_prefix(texts, prefix) 8 | } 9 | \arguments{ 10 | \item{texts}{a \link{character} containing the original text} 11 | 12 | \item{prefix}{unit \link{character} containing the prefix to add (length == 1) or \link{character} with same length than texts} 13 | } 14 | \value{ 15 | \link{character} with prefixed words. 16 | } 17 | \description{ 18 | Add a custom prefix to each word of a a line to create different spaces. 19 | Code in C++ (efficient). 
20 | } 21 | \examples{ 22 | add_prefix(c("this is a test", "this is another test"), "#") 23 | } 24 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | loadModule("FASTRTEXT_MODULE", TRUE) 2 | 3 | #' @name fastrtext 4 | #' @useDynLib fastrtext, .registration = TRUE 5 | #' @importFrom Rcpp evalCpp loadModule cpp_object_initializer 6 | #' @import methods 7 | "_PACKAGE" 8 | 9 | #' Rcpp_fastrtext class 10 | #' 11 | #' Models are [S4] objects with several slots (methods) which can be called that way: model$slot_name() 12 | #' 13 | #' @name Rcpp_fastrtext-class 14 | #' 15 | #' @slot load Load a model 16 | #' @slot predict Make a prediction 17 | #' @slot execute Execute commands 18 | #' @slot get_vectors Get vectors related to provided words 19 | #' @slot get_parameters Get parameters used to train the model 20 | #' @slot get_dictionary List all words learned 21 | #' @slot get_labels List all labels learned 22 | NULL 23 | -------------------------------------------------------------------------------- /man/Rcpp_fastrtext-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \name{Rcpp_fastrtext-class} 4 | \alias{Rcpp_fastrtext-class} 5 | \title{Rcpp_fastrtext class} 6 | \description{ 7 | Models are \link{S4} objects with several slots (methods) which can be called that way: model$slot_name() 8 | } 9 | \section{Slots}{ 10 | 11 | \describe{ 12 | \item{\code{load}}{Load a model} 13 | 14 | \item{\code{predict}}{Make a prediction} 15 | 16 | \item{\code{execute}}{Execute commands} 17 | 18 | \item{\code{get_vectors}}{Get vectors related to provided words} 19 | 20 | \item{\code{get_parameters}}{Get parameters used to train the model} 21 | 22 | \item{\code{get_dictionary}}{List all words learned} 23 | 24 | 
\item{\code{get_labels}}{List all labels learned} 25 | }} 26 | 27 | -------------------------------------------------------------------------------- /man/get_sentence_representation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_sentence_representation} 4 | \alias{get_sentence_representation} 5 | \title{Get sentence embedding} 6 | \usage{ 7 | get_sentence_representation(model, sentences) 8 | } 9 | \arguments{ 10 | \item{model}{\code{fastText} model} 11 | 12 | \item{sentences}{\link{character} containing the sentences} 13 | } 14 | \description{ 15 | Sentence is splitted in words (using space characters), and word embeddings are averaged. 16 | } 17 | \examples{ 18 | library(fastrtext) 19 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 20 | model <- load_model(model_test_path) 21 | m <- get_sentence_representation(model, "this is a test") 22 | print(m) 23 | } 24 | -------------------------------------------------------------------------------- /man/get_word_distance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_word_distance} 4 | \alias{get_word_distance} 5 | \title{Distance between two words} 6 | \usage{ 7 | get_word_distance(model, w1, w2) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model. 
Null if train a new model.} 11 | 12 | \item{w1}{first word to compare} 13 | 14 | \item{w2}{second word to compare} 15 | } 16 | \value{ 17 | a \code{scalar} with the distance 18 | } 19 | \description{ 20 | Distance is equal to \code{1 - cosine} 21 | } 22 | \examples{ 23 | 24 | library(fastrtext) 25 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 26 | model <- load_model(model_test_path) 27 | get_word_distance(model, "time", "timing") 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/r_compliance.h: -------------------------------------------------------------------------------- 1 | // Content of this file is added to each source of fastText to change some behaviours 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define exit(status_code) exit_fasttext(status_code) 10 | #define cerr Rcout // with cerr, no line refresh possible on R (it is an issue for learning with verbose set to 2, progress line is updated) 11 | #define cout Rcout 12 | #define main main_fastrtext // no direct call to main(), otherwise Cran complains + strange errors 13 | 14 | 15 | // catch the call to exit and call Rcpp::stop() when there is a fail 16 | void exit_fasttext(int error_code); 17 | 18 | namespace std { 19 | // Copy of Rcout in std namespace to reroute cout to R terminal with a macro 20 | extern std::ostream Rcout; 21 | } 22 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /man/get_word_ids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_word_ids} 4 | \alias{get_word_ids} 5 | 
\title{Retrieve word IDs} 6 | \usage{ 7 | get_word_ids(model, words) 8 | } 9 | \arguments{ 10 | \item{model}{\code{fastText} model} 11 | 12 | \item{words}{\link{character} containing words to retrieve IDs} 13 | } 14 | \value{ 15 | \link{numeric} of ids 16 | } 17 | \description{ 18 | Get ID of words in the dictionary 19 | } 20 | \examples{ 21 | library(fastrtext) 22 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 23 | model <- load_model(model_test_path) 24 | ids <- get_word_ids(model, c("this", "is", "a", "test")) 25 | 26 | # print positions 27 | print(ids) 28 | # retrieve words in the dictionary using the positions retrieved 29 | print(get_dictionary(model)[ids]) 30 | } 31 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,Rcpp_fastrtext) 4 | export(add_prefix) 5 | export(add_tags) 6 | export(build_supervised) 7 | export(build_vectors) 8 | export(execute) 9 | export(get_dictionary) 10 | export(get_hamming_loss) 11 | export(get_labels) 12 | export(get_nn) 13 | export(get_parameters) 14 | export(get_sentence_representation) 15 | export(get_tokenized_text) 16 | export(get_word_distance) 17 | export(get_word_ids) 18 | export(get_word_vectors) 19 | export(load_model) 20 | export(print_help) 21 | import(methods) 22 | importFrom(Rcpp,cpp_object_initializer) 23 | importFrom(Rcpp,evalCpp) 24 | importFrom(Rcpp,loadModule) 25 | importFrom(assertthat,assert_that) 26 | importFrom(assertthat,is.count) 27 | importFrom(assertthat,is.flag) 28 | importFrom(assertthat,is.number) 29 | importFrom(assertthat,is.string) 30 | useDynLib(fastrtext, .registration = TRUE) 31 | -------------------------------------------------------------------------------- /man/get_tokenized_text.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_tokenized_text} 4 | \alias{get_tokenized_text} 5 | \title{Tokenize text} 6 | \usage{ 7 | get_tokenized_text(model, texts) 8 | } 9 | \arguments{ 10 | \item{model}{\code{fastText} model} 11 | 12 | \item{texts}{a \link{character} containing the documents} 13 | } 14 | \value{ 15 | a \link{list} of \link{character} containing words 16 | } 17 | \description{ 18 | Separate words in a text using space characters 19 | } 20 | \examples{ 21 | library(fastrtext) 22 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 23 | model <- load_model(model_test_path) 24 | tokens <- get_tokenized_text(model, "this is a test") 25 | print(tokens) 26 | tokens <- get_tokenized_text(model, c("this is a test 1", "this is a second test!")) 27 | print(tokens) 28 | } 29 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | reference: 2 | - title: "General" 3 | desc: "Functions useful in both supervised and unsupervised contexts." 4 | contents: 5 | - fastrtext 6 | - Rcpp_fastrtext-class 7 | - load_model 8 | - execute 9 | - get_parameters 10 | - print_help 11 | - title: "Supervised learning" 12 | desc: "Function useful for text classification." 13 | contents: 14 | - predict.Rcpp_fastrtext 15 | - get_hamming_loss 16 | - get_labels 17 | - title: "Unsupervised learning" 18 | desc: "Functions useful to play with word representations." 19 | contents: 20 | - get_word_vectors 21 | - get_word_distance 22 | - get_nn 23 | - get_dictionary 24 | - title: data 25 | desc: "Data embedded in the package for help and tests." 
26 | contents: 27 | - train_sentences 28 | - test_sentences 29 | - stop_words_sentences 30 | 31 | 32 | -------------------------------------------------------------------------------- /man/get_word_vectors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_word_vectors} 4 | \alias{get_word_vectors} 5 | \title{Get word embeddings} 6 | \usage{ 7 | get_word_vectors(model, words = get_dictionary(model)) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model} 11 | 12 | \item{words}{\link{character} of words. Default: return every word from the dictionary.} 13 | } 14 | \value{ 15 | \link{matrix} containing each word embedding as a row and \code{rownames} are populated with word strings. 16 | } 17 | \description{ 18 | Return the vector representation of provided words (unsupervised training) 19 | or provided labels (supervised training). 20 | } 21 | \examples{ 22 | 23 | library(fastrtext) 24 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 25 | model <- load_model(model_test_path) 26 | get_word_vectors(model, c("introduction", "we")) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/get_hamming_loss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_hamming_loss} 4 | \alias{get_hamming_loss} 5 | \title{Hamming loss} 6 | \usage{ 7 | get_hamming_loss(labels, predictions) 8 | } 9 | \arguments{ 10 | \item{labels}{list of labels} 11 | 12 | \item{predictions}{list returned by the predict command (including both the probability and the categories)} 13 | } 14 | \value{ 15 | a \code{scalar} with the loss 16 | } 17 | \description{ 18 | Compute the hamming loss. 
When there is only one category, this measure the accuracy. 19 | } 20 | \examples{ 21 | 22 | library(fastrtext) 23 | data("test_sentences") 24 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 25 | model <- load_model(model_test_path) 26 | sentences <- test_sentences[, "text"] 27 | test_labels <- test_sentences[, "class.text"] 28 | predictions <- predict(model, sentences) 29 | get_hamming_loss(as.list(test_labels), predictions) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/get_nn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{get_nn} 4 | \alias{get_nn} 5 | \title{Get nearest neighbour vectors} 6 | \usage{ 7 | get_nn(model, word, k) 8 | } 9 | \arguments{ 10 | \item{model}{trained \code{fastText} model. Null if train a new model.} 11 | 12 | \item{word}{reference word} 13 | 14 | \item{k}{\link{integer} defining the number of results to return} 15 | } 16 | \value{ 17 | \link{numeric} with distances with \link{names} as words 18 | } 19 | \description{ 20 | Find the \code{k} words with the smallest distance. 21 | First execution can be slow because of precomputation. 22 | Search is done linearly, if your model is big you may want to use an approximate neighbour algorithm from other R packages (like RcppAnnoy). 
23 | } 24 | \examples{ 25 | 26 | library(fastrtext) 27 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 28 | model <- load_model(model_test_path) 29 | get_nn(model, "time", 10) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' Add a prefix to each word 5 | #' 6 | #' Add a custom prefix to each word of a a line to create different spaces. 7 | #' Code in C++ (efficient). 8 | #' 9 | #' @param texts a [character] containing the original text 10 | #' @param prefix unit [character] containing the prefix to add (length == 1) or [character] with same length than texts 11 | #' @return [character] with prefixed words. 12 | #' @examples 13 | #' add_prefix(c("this is a test", "this is another test"), "#") 14 | #' @export 15 | add_prefix <- function(texts, prefix) { 16 | .Call(`_fastrtext_add_prefix`, texts, prefix) 17 | } 18 | 19 | add_pr <- function(line, prefix) { 20 | .Call(`_fastrtext_add_pr`, line, prefix) 21 | } 22 | 23 | # Register entry points for exported C++ functions 24 | methods::setLoadAction(function(ns) { 25 | .Call('_fastrtext_RcppExport_registerCCallable', PACKAGE = 'fastrtext') 26 | }) 27 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 
| 13 | cache: 14 | - C:\RLibrary 15 | 16 | # Adapt as necessary starting from here 17 | 18 | build_script: 19 | - travis-tool.sh install_deps 20 | 21 | test_script: 22 | - travis-tool.sh run_tests 23 | 24 | on_failure: 25 | - 7z a failure.zip *.Rcheck\* 26 | - appveyor PushArtifact failure.zip 27 | 28 | artifacts: 29 | - path: '*.Rcheck\**\*.log' 30 | name: Logs 31 | 32 | - path: '*.Rcheck\**\*.out' 33 | name: Logs 34 | 35 | - path: '*.Rcheck\**\*.fail' 36 | name: Logs 37 | 38 | - path: '*.Rcheck\**\*.Rout' 39 | name: Logs 40 | 41 | - path: '\*_*.tar.gz' 42 | name: Bits 43 | 44 | - path: '\*_*.zip' 45 | name: Bits 46 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKGROOT = ./fasttext 2 | 3 | CXX_STD = CXX11 4 | 5 | # include adds a header to each file, ugly hack to block call to exit() and replace cerr by cout 6 | PKG_CPPFLAGS = -pthread -include r_compliance.h -I$(PKGROOT) 7 | 8 | # pthread is used for multithreading by fastText 9 | PKG_LIBS = -pthread 10 | 11 | OBJECTS = add_prefix.o r_compliance.o $(PKGROOT)/autotune.o $(PKGROOT)/args.o $(PKGROOT)/matrix.o $(PKGROOT)/dictionary.o $(PKGROOT)/loss.o $(PKGROOT)/productquantizer.o $(PKGROOT)/densematrix.o $(PKGROOT)/quantmatrix.o $(PKGROOT)/vector.o $(PKGROOT)/model.o $(PKGROOT)/utils.o $(PKGROOT)/meter.o $(PKGROOT)/fasttext.o $(PKGROOT)/main.o fastrtext.o RcppExports.o 12 | 13 | # Reduce the size of the compiled library by removing unneeded debug information 14 | # Need to check if we are on Linux and if strip is installed 15 | # http://dirk.eddelbuettel.com/blog/2017/08/14/#009_compact_shared_libraries 16 | # strippedLib: $(SHLIB) 17 | # if test -e "/usr/bin/strip" && test -e "/bin/uname" && [[ `uname` == "Linux" ]] ; then /usr/bin/strip --strip-unneeded -K R_registerRoutines -K R_useDynamicSymbols $(SHLIB); fi 18 | 19 | # .phony: strippedLib 20 | 
-------------------------------------------------------------------------------- /vignettes/unsupervised_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Unsupervised learning" 3 | author: "M. Benesty" 4 | output: rmarkdown::html_vignette 5 | date: "`r Sys.Date()`" 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{Unsupervised learning} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r unsupervised_learning} 13 | library(fastrtext) 14 | 15 | data("train_sentences") 16 | data("test_sentences") 17 | texts <- tolower(train_sentences[,"text"]) 18 | tmp_file_txt <- tempfile() 19 | tmp_file_model <- tempfile() 20 | writeLines(text = texts, con = tmp_file_txt) 21 | execute(commands = c("skipgram", "-input", tmp_file_txt, "-output", tmp_file_model, "-verbose", 1)) 22 | 23 | model <- load_model(tmp_file_model) 24 | 25 | # test word extraction 26 | dict <- get_dictionary(model) 27 | print(head(dict, 5)) 28 | 29 | # print vector 30 | print(get_word_vectors(model, c("time", "timing"))) 31 | 32 | # test word distance 33 | get_word_distance(model, "time", "timing") 34 | 35 | # free memory 36 | unlink(tmp_file_txt) 37 | unlink(tmp_file_model) 38 | rm(model) 39 | gc() 40 | ``` 41 | -------------------------------------------------------------------------------- /src/fasttext/matrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include "real.h" 18 | 19 | namespace fasttext { 20 | 21 | class Vector; 22 | 23 | class Matrix { 24 | protected: 25 | int64_t m_; 26 | int64_t n_; 27 | 28 | public: 29 | Matrix(); 30 | explicit Matrix(int64_t, int64_t); 31 | virtual ~Matrix() = default; 32 | 33 | int64_t size(int64_t dim) const; 34 | 35 | virtual real dotRow(const Vector&, int64_t) const = 0; 36 | virtual void addVectorToRow(const Vector&, int64_t, real) = 0; 37 | virtual void addRowToVector(Vector& x, int32_t i) const = 0; 38 | virtual void addRowToVector(Vector& x, int32_t i, real a) const = 0; 39 | virtual void save(std::ostream&) const = 0; 40 | virtual void load(std::istream&) = 0; 41 | virtual void dump(std::ostream&) const = 0; 42 | }; 43 | 44 | } // namespace fasttext 45 | -------------------------------------------------------------------------------- /man/add_tags.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{add_tags} 4 | \alias{add_tags} 5 | \title{Add tags to documents} 6 | \usage{ 7 | add_tags(documents, tags, prefix = "__label__", new_lines = " ") 8 | } 9 | \arguments{ 10 | \item{documents}{texts to learn} 11 | 12 | \item{tags}{labels provided as a \link{list} or a \link{vector}. There can be 1 or more per document.} 13 | 14 | \item{prefix}{\link{character} to add in front of tag (\code{fastText} format)} 15 | 16 | \item{new_lines}{Character that replaces new lines (\code{\\r\\n}), default is space.} 17 | } 18 | \value{ 19 | \link{character} ready to be written in a file 20 | } 21 | \description{ 22 | Add tags in the \code{fastText} format. 23 | This format is required for the training step. 
As fastText doesn't support newlines inside documents 24 | (as newlines are delimiting documents) this function also ensures that there are absolutely no 25 | new lines. By default new lines are replaced by a single space. 26 | } 27 | \examples{ 28 | library(fastrtext) 29 | tags <- list(c(1, 5), 0) 30 | documents <- c("this is a text", "this is another document") 31 | add_tags(documents = documents, tags = tags) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/fastrtext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \docType{package} 4 | \name{fastrtext} 5 | \alias{fastrtext} 6 | \alias{fastrtext-package} 7 | \title{fastrtext: 'fastText' Wrapper for Text Classification and Word Representation} 8 | \description{ 9 | Learning text representations and text classifiers may rely 10 | on the same simple and efficient approach. 'fastText' is an open-source, free, 11 | lightweight library that allows users to perform both tasks. 12 | It transforms text into continuous vectors that can later 13 | be used on many language related task. 14 | It works on standard, generic hardware (no 'GPU' required). 15 | It also includes model size reduction feature. 16 | 'fastText' original source code is available 17 | at . 
18 | } 19 | \seealso{ 20 | Useful links: 21 | \itemize{ 22 | \item \url{https://github.com/pommedeterresautee/fastrtext} 23 | \item \url{https://pommedeterresautee.github.io/fastrtext/} 24 | \item Report bugs at \url{https://github.com/pommedeterresautee/fastrtext/issues} 25 | } 26 | 27 | } 28 | \author{ 29 | \strong{Maintainer}: Michaël Benesty \email{michael@benesty.fr} [copyright holder] 30 | 31 | Other contributors: 32 | \itemize{ 33 | \item Facebook, Inc \email{bojanowski@fb.com} [copyright holder] 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/fasttext/utils.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "utils.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace fasttext { 15 | 16 | namespace utils { 17 | 18 | int64_t size(std::ifstream& ifs) { 19 | ifs.seekg(std::streamoff(0), std::ios::end); 20 | return ifs.tellg(); 21 | } 22 | 23 | void seek(std::ifstream& ifs, int64_t pos) { 24 | ifs.clear(); 25 | ifs.seekg(std::streampos(pos)); 26 | } 27 | 28 | double getDuration( 29 | const std::chrono::steady_clock::time_point& start, 30 | const std::chrono::steady_clock::time_point& end) { 31 | return std::chrono::duration_cast>(end - start) 32 | .count(); 33 | } 34 | 35 | ClockPrint::ClockPrint(int32_t duration) : duration_(duration) {} 36 | 37 | std::ostream& operator<<(std::ostream& out, const ClockPrint& me) { 38 | int32_t etah = me.duration_ / 3600; 39 | int32_t etam = (me.duration_ % 3600) / 60; 40 | int32_t etas = (me.duration_ % 3600) % 60; 41 | 42 | out << std::setw(3) << etah << "h" << std::setw(2) << etam << "m"; 43 | out << std::setw(2) << etas << "s"; 44 | return out; 45 | } 46 | 47 | } // namespace utils 48 | 
49 | } // namespace fasttext 50 | -------------------------------------------------------------------------------- /data-raw/create_models.R: -------------------------------------------------------------------------------- 1 | # The purpose of this script is to create models 2 | # used in tests. 3 | 4 | require(fastrtext) 5 | 6 | data("train_sentences") 7 | data("test_sentences") 8 | 9 | # Unsupervised 10 | texts <- tolower(train_sentences[, "text"]) 11 | tmp_file_txt <- tempfile() 12 | tmp_file_model <- "./data-raw/model_unsupervised_test" 13 | writeLines(text = texts, con = tmp_file_txt) 14 | execute(commands = c("skipgram", 15 | "-input", tmp_file_txt, 16 | "-output", tmp_file_model, 17 | "-dim", 70, 18 | "-bucket", 1e3, 19 | "-epoch", 20)) 20 | 21 | # Supervised 22 | train_labels <- paste0("__label__", train_sentences[, "class.text"]) 23 | train_texts <- tolower(train_sentences[, "text"]) 24 | train_to_write <- paste(train_labels, train_texts) 25 | train_tmp_file_txt <- tempfile() 26 | tmp_file_model <- "./data-raw/model_classification_test" 27 | writeLines(text = train_to_write, con = train_tmp_file_txt) 28 | 29 | test_labels <- paste0("__label__", test_sentences[, "class.text"]) 30 | test_texts <- tolower(test_sentences[, "text"]) 31 | test_to_write <- paste(test_labels, test_texts) 32 | 33 | # learn model 34 | execute(commands = 35 | c("supervised", 36 | "-input", train_tmp_file_txt, 37 | "-output", tmp_file_model, 38 | "-dim", 20, 39 | "-lr", 1, 40 | "-epoch", 20, 41 | "-wordNgrams", 2, 42 | "-bucket", 1e3)) -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: fastrtext 2 | Type: Package 3 | Title: 'fastText' Wrapper for Text Classification and Word Representation 4 | Version: 0.3.4 5 | Date: 2019-10-27 6 | Authors@R: c(person("Michaël", "Benesty", role = c("aut", "cre", "cph"), email = "michael@benesty.fr"), 7 | 
person("Facebook, Inc", role = c("cph"), email = "bojanowski@fb.com")) 8 | Maintainer: Michaël Benesty 9 | Description: Learning text representations and text classifiers may rely 10 | on the same simple and efficient approach. 'fastText' is an open-source, free, 11 | lightweight library that allows users to perform both tasks. 12 | It transforms text into continuous vectors that can later 13 | be used on many language related task. 14 | It works on standard, generic hardware (no 'GPU' required). 15 | It also includes model size reduction feature. 16 | 'fastText' original source code is available 17 | at . 18 | URL: https://github.com/pommedeterresautee/fastrtext, https://pommedeterresautee.github.io/fastrtext/ 19 | BugReports: https://github.com/pommedeterresautee/fastrtext/issues 20 | License: MIT + file LICENSE 21 | Depends: R (>= 3.3) 22 | Imports: methods, 23 | Rcpp (>= 0.12.12), 24 | assertthat 25 | Suggests: knitr, 26 | testthat 27 | LinkingTo: Rcpp 28 | LazyData: true 29 | VignetteBuilder: knitr 30 | Roxygen: list(markdown = TRUE, roclets = c("rd", "collate", "namespace_roclet")) 31 | RoxygenNote: 6.1.1 32 | Encoding: UTF-8 33 | NeedsCompilation: yes 34 | -------------------------------------------------------------------------------- /src/fasttext/utils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include "real.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #if defined(__clang__) || defined(__GNUC__) 20 | #define FASTTEXT_DEPRECATED(msg) __attribute__((__deprecated__(msg))) 21 | #elif defined(_MSC_VER) 22 | #define FASTTEXT_DEPRECATED(msg) __declspec(deprecated(msg)) 23 | #else 24 | #define FASTTEXT_DEPRECATED(msg) 25 | #endif 26 | 27 | namespace fasttext { 28 | 29 | using Predictions = std::vector>; 30 | 31 | namespace utils { 32 | 33 | int64_t size(std::ifstream&); 34 | 35 | void seek(std::ifstream&, int64_t); 36 | 37 | template 38 | bool contains(const std::vector& container, const T& value) { 39 | return std::find(container.begin(), container.end(), value) != 40 | container.end(); 41 | } 42 | 43 | double getDuration( 44 | const std::chrono::steady_clock::time_point& start, 45 | const std::chrono::steady_clock::time_point& end); 46 | 47 | class ClockPrint { 48 | public: 49 | explicit ClockPrint(int32_t duration); 50 | friend std::ostream& operator<<(std::ostream& out, const ClockPrint& me); 51 | 52 | private: 53 | int32_t duration_; 54 | }; 55 | 56 | } // namespace utils 57 | 58 | } // namespace fasttext 59 | -------------------------------------------------------------------------------- /src/fasttext/vector.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "real.h" 16 | 17 | namespace fasttext { 18 | 19 | class Matrix; 20 | 21 | class Vector { 22 | protected: 23 | std::vector data_; 24 | 25 | public: 26 | explicit Vector(int64_t); 27 | Vector(const Vector&) = default; 28 | Vector(Vector&&) noexcept = default; 29 | Vector& operator=(const Vector&) = default; 30 | Vector& operator=(Vector&&) = default; 31 | 32 | inline real* data() { 33 | return data_.data(); 34 | } 35 | inline const real* data() const { 36 | return data_.data(); 37 | } 38 | inline real& operator[](int64_t i) { 39 | return data_[i]; 40 | } 41 | inline const real& operator[](int64_t i) const { 42 | return data_[i]; 43 | } 44 | 45 | inline int64_t size() const { 46 | return data_.size(); 47 | } 48 | void zero(); 49 | void mul(real); 50 | real norm() const; 51 | void addVector(const Vector& source); 52 | void addVector(const Vector&, real); 53 | void addRow(const Matrix&, int64_t); 54 | void addRow(const Matrix&, int64_t, real); 55 | void mul(const Matrix&, const Vector&); 56 | int64_t argmax(); 57 | }; 58 | 59 | std::ostream& operator<<(std::ostream&, const Vector&); 60 | 61 | } // namespace fasttext 62 | -------------------------------------------------------------------------------- /man/predict.Rcpp_fastrtext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{predict.Rcpp_fastrtext} 4 | \alias{predict.Rcpp_fastrtext} 5 | \title{Get predictions (for supervised model)} 6 | \usage{ 7 | \method{predict}{Rcpp_fastrtext}(object, sentences, k = 1, 8 | simplify = FALSE, unlock_empty_predictions = FALSE, threshold = 0, 9 | ...) 
10 | } 11 | \arguments{ 12 | \item{object}{trained \code{fastText} model} 13 | 14 | \item{sentences}{\link{character} containing the sentences} 15 | 16 | \item{k}{will return the \code{k} most probable labels (default = 1)} 17 | 18 | \item{simplify}{when \link{TRUE} and \code{k} = 1, function return a (flat) \link{numeric} instead of a \link{list}} 19 | 20 | \item{unlock_empty_predictions}{\link{logical} to avoid crash when some predictions are not provided for some sentences because all their words have not been seen during training. This parameter should only be set to \link{TRUE} to debug.} 21 | 22 | \item{threshold}{used to limit number of words used. (optional; 0.0 by default)} 23 | 24 | \item{...}{not used} 25 | } 26 | \value{ 27 | \link{list} containing for each sentence the probability to be associated with \code{k} labels. 28 | } 29 | \description{ 30 | Apply the trained model to new sentences. 31 | Average word embeddings and search most similar \code{label} vector. 32 | } 33 | \examples{ 34 | 35 | library(fastrtext) 36 | data("test_sentences") 37 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 38 | model <- load_model(model_test_path) 39 | sentence <- test_sentences[1, "text"] 40 | print(predict(model, sentence)) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/fasttext/quantmatrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include "real.h" 19 | 20 | #include "densematrix.h" 21 | #include "matrix.h" 22 | #include "vector.h" 23 | 24 | #include "productquantizer.h" 25 | 26 | namespace fasttext { 27 | 28 | class QuantMatrix : public Matrix { 29 | protected: 30 | std::unique_ptr pq_; 31 | std::unique_ptr npq_; 32 | 33 | std::vector codes_; 34 | std::vector norm_codes_; 35 | 36 | bool qnorm_; 37 | int32_t codesize_; 38 | 39 | public: 40 | QuantMatrix(); 41 | QuantMatrix(DenseMatrix&&, int32_t, bool); 42 | QuantMatrix(const QuantMatrix&) = delete; 43 | QuantMatrix(QuantMatrix&&) = delete; 44 | QuantMatrix& operator=(const QuantMatrix&) = delete; 45 | QuantMatrix& operator=(QuantMatrix&&) = delete; 46 | virtual ~QuantMatrix() noexcept override = default; 47 | 48 | void quantizeNorm(const Vector&); 49 | void quantize(DenseMatrix&& mat); 50 | 51 | real dotRow(const Vector&, int64_t) const override; 52 | void addVectorToRow(const Vector&, int64_t, real) override; 53 | void addRowToVector(Vector& x, int32_t i) const override; 54 | void addRowToVector(Vector& x, int32_t i, real a) const override; 55 | void save(std::ostream&) const override; 56 | void load(std::istream&) override; 57 | void dump(std::ostream&) const override; 58 | }; 59 | 60 | } // namespace fasttext 61 | -------------------------------------------------------------------------------- /src/fasttext/productquantizer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "real.h" 18 | #include "vector.h" 19 | 20 | namespace fasttext { 21 | 22 | class ProductQuantizer { 23 | protected: 24 | const int32_t nbits_ = 8; 25 | const int32_t ksub_ = 1 << nbits_; 26 | const int32_t max_points_per_cluster_ = 256; 27 | const int32_t max_points_ = max_points_per_cluster_ * ksub_; 28 | const int32_t seed_ = 1234; 29 | const int32_t niter_ = 25; 30 | const real eps_ = 1e-7; 31 | 32 | int32_t dim_; 33 | int32_t nsubq_; 34 | int32_t dsub_; 35 | int32_t lastdsub_; 36 | 37 | std::vector centroids_; 38 | 39 | std::minstd_rand rng; 40 | 41 | public: 42 | ProductQuantizer() {} 43 | ProductQuantizer(int32_t, int32_t); 44 | 45 | real* get_centroids(int32_t, uint8_t); 46 | const real* get_centroids(int32_t, uint8_t) const; 47 | 48 | real assign_centroid(const real*, const real*, uint8_t*, int32_t) const; 49 | void Estep(const real*, const real*, uint8_t*, int32_t, int32_t) const; 50 | void MStep(const real*, real*, const uint8_t*, int32_t, int32_t); 51 | void kmeans(const real*, real*, int32_t, int32_t); 52 | void train(int, const real*); 53 | 54 | real mulcode(const Vector&, const uint8_t*, int32_t, real) const; 55 | void addcode(Vector&, const uint8_t*, int32_t, real) const; 56 | void compute_code(const real*, uint8_t*) const; 57 | void compute_codes(const real*, uint8_t*, int32_t) const; 58 | 59 | void save(std::ostream&) const; 60 | void load(std::istream&); 61 | }; 62 | 63 | } // namespace fasttext 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![fastrtext](https://github.com/pommedeterresautee/fastrtext/raw/master/tools/logo.png) 2 | ========= 3 | 4 | [![Travis-CI Build 
Status](https://travis-ci.org/pommedeterresautee/fastrtext.svg?branch=master)](https://travis-ci.org/pommedeterresautee/fastrtext) 5 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/pommedeterresautee/fastrtext?branch=master&svg=true)](https://ci.appveyor.com/project/pommedeterresautee/fastrtext) 6 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/fastrtext)](https://cran.r-project.org/package=fastrtext) 7 | [![CRAN_time_from_release](https://www.r-pkg.org/badges/ago/fastrtext)](https://cran.r-project.org/package=fastrtext) 8 | [![CRAN_Download](http://cranlogs.r-pkg.org/badges/fastrtext)](http://cran.rstudio.com/web/packages/fastrtext/index.html) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 10 | [![codecov](https://codecov.io/gh/pommedeterresautee/fastrtext/branch/master/graph/badge.svg)](https://codecov.io/gh/pommedeterresautee/fastrtext) 11 | [![Follow](https://img.shields.io/twitter/follow/pommedeterre33.svg?style=social)](https://twitter.com/intent/follow?screen_name=pommedeterre33) 12 | 13 | [R Documentation](https://pommedeterresautee.github.io/fastrtext/) | [Release Notes](https://github.com/pommedeterresautee/fastrtext/blob/master/NEWS.md) | [FAQ](https://fasttext.cc/docs/en/faqs.html) | [Multilingual pretrained models](https://fasttext.cc/docs/en/crawl-vectors.html) 14 | 15 | R wrapper for [fastText](https://github.com/facebookresearch/fastText) C++ code from Facebook. 16 | 17 | FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. 18 | 19 | 20 | ## License 21 | 22 | © Contributors, 2019. Licensed under a MIT license. 
23 | -------------------------------------------------------------------------------- /man/execute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{execute} 4 | \alias{execute} 5 | \title{Execute command on \code{fastText} model (including training)} 6 | \usage{ 7 | execute(commands) 8 | } 9 | \arguments{ 10 | \item{commands}{\link{character} of commands} 11 | } 12 | \description{ 13 | Use the same commands than the one to use for the command line. 14 | } 15 | \examples{ 16 | \dontrun{ 17 | # Supervised learning example 18 | library(fastrtext) 19 | 20 | data("train_sentences") 21 | data("test_sentences") 22 | 23 | # prepare data 24 | tmp_file_model <- tempfile() 25 | 26 | train_labels <- paste0("__label__", train_sentences[,"class.text"]) 27 | train_texts <- tolower(train_sentences[,"text"]) 28 | train_to_write <- paste(train_labels, train_texts) 29 | train_tmp_file_txt <- tempfile() 30 | writeLines(text = train_to_write, con = train_tmp_file_txt) 31 | 32 | test_labels <- paste0("__label__", test_sentences[,"class.text"]) 33 | test_texts <- tolower(test_sentences[,"text"]) 34 | test_to_write <- paste(test_labels, test_texts) 35 | 36 | # learn model 37 | execute(commands = c("supervised", "-input", train_tmp_file_txt, 38 | "-output", tmp_file_model, "-dim", 20, "-lr", 1, 39 | "-epoch", 20, "-wordNgrams", 2, "-verbose", 1)) 40 | 41 | model <- load_model(tmp_file_model) 42 | predict(model, sentences = test_sentences[1, "text"]) 43 | 44 | # Unsupervised learning example 45 | library(fastrtext) 46 | 47 | data("train_sentences") 48 | data("test_sentences") 49 | texts <- tolower(train_sentences[,"text"]) 50 | tmp_file_txt <- tempfile() 51 | tmp_file_model <- tempfile() 52 | writeLines(text = texts, con = tmp_file_txt) 53 | execute(commands = c("skipgram", "-input", tmp_file_txt, "-output", tmp_file_model, "-verbose", 1)) 54 | 55 | model 
<- load_model(tmp_file_model) 56 | dict <- get_dictionary(model) 57 | get_word_vectors(model, head(dict, 5)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /man/test_sentences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{test_sentences} 5 | \alias{test_sentences} 6 | \title{Sentence corpus - test part} 7 | \format{2 data frame with 3117 rows and 2 variables: 8 | \describe{ 9 | \item{text}{the sentences as a character vector} 10 | \item{class.text}{the category of the sentence} 11 | }} 12 | \source{ 13 | \url{https://archive.ics.uci.edu/ml/index.php} 14 | } 15 | \usage{ 16 | test_sentences 17 | } 18 | \description{ 19 | This corpus contains sentences from 20 | the abstract and introduction of 30 scientific articles that have been 21 | annotated (i.e. labeled or tagged) according to a modified version of the 22 | Argumentative Zones annotation scheme. 23 | } 24 | \details{ 25 | These 30 scientific articles come 26 | from three different domains: 27 | \enumerate{ 28 | \item PLoS Computational Biology (PLOS) 29 | \item The machine learning repository on arXiv (ARXIV) 30 | \item The psychology journal Judgment and Decision Making (JDM) 31 | } 32 | 33 | There are 10 articles from each domain. In addition to the labeled data, this 34 | corpus also contains a corresponding set of unlabeled articles. These unlabeled 35 | articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 36 | from each domain (again, only the sentences from the abstract and 37 | introduction). These unlabeled articles can be used for unsupervised or 38 | semi-supervised approaches to sentence classification which rely on a small set 39 | of labeled data and a larger set of unlabeled data. 40 | 41 | ===== References ===== 42 | 43 | S. Teufel and M. Moens. 
Summarizing scientific articles: experiments with 44 | relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 45 | 2002. 46 | 47 | S. Teufel. Argumentative zoning: information extraction from scientific 48 | text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/train_sentences.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{train_sentences} 5 | \alias{train_sentences} 6 | \title{Sentence corpus - train part} 7 | \format{2 data frame with 3117 rows and 2 variables: 8 | \describe{ 9 | \item{text}{the sentences as a character vector} 10 | \item{class.text}{the category of the sentence} 11 | }} 12 | \source{ 13 | \url{https://archive.ics.uci.edu/ml/index.php} 14 | } 15 | \usage{ 16 | train_sentences 17 | } 18 | \description{ 19 | This corpus contains sentences from 20 | the abstract and introduction of 30 scientific articles that have been 21 | annotated (i.e. labeled or tagged) according to a modified version of the 22 | Argumentative Zones annotation scheme. 23 | } 24 | \details{ 25 | These 30 scientific articles come 26 | from three different domains: 27 | \enumerate{ 28 | \item PLoS Computational Biology (PLOS) 29 | \item The machine learning repository on arXiv (ARXIV) 30 | \item The psychology journal Judgment and Decision Making (JDM) 31 | } 32 | 33 | There are 10 articles from each domain. In addition to the labeled data, this 34 | corpus also contains a corresponding set of unlabeled articles. These unlabeled 35 | articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 36 | from each domain (again, only the sentences from the abstract and 37 | introduction). 
These unlabeled articles can be used for unsupervised or 38 | semi-supervised approaches to sentence classification which rely on a small set 39 | of labeled data and a larger set of unlabeled data. 40 | 41 | ===== References ===== 42 | 43 | S. Teufel and M. Moens. Summarizing scientific articles: experiments with 44 | relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 45 | 2002. 46 | 47 | S. Teufel. Argumentative zoning: information extraction from scientific 48 | text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /src/fasttext/meter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | 14 | #include "dictionary.h" 15 | #include "real.h" 16 | #include "utils.h" 17 | 18 | namespace fasttext { 19 | 20 | class Meter { 21 | struct Metrics { 22 | uint64_t gold; 23 | uint64_t predicted; 24 | uint64_t predictedGold; 25 | mutable std::vector> scoreVsTrue; 26 | 27 | Metrics() : gold(0), predicted(0), predictedGold(0) {} 28 | 29 | double precision() const { 30 | if (predicted == 0) { 31 | return std::numeric_limits::quiet_NaN(); 32 | } 33 | return predictedGold / double(predicted); 34 | } 35 | double recall() const { 36 | if (gold == 0) { 37 | return std::numeric_limits::quiet_NaN(); 38 | } 39 | return predictedGold / double(gold); 40 | } 41 | double f1Score() const { 42 | if (predicted + gold == 0) { 43 | return std::numeric_limits::quiet_NaN(); 44 | } 45 | return 2 * predictedGold / double(predicted + gold); 46 | } 47 | }; 48 | 49 | public: 50 | Meter() : metrics_(), nexamples_(0), labelMetrics_() {} 51 | 52 | void log(const std::vector& labels, const Predictions& predictions); 53 | 54 | double precision(int32_t); 55 | double recall(int32_t); 56 | double f1Score(int32_t); 57 | double precision() const; 58 | double recall() const; 59 | double f1Score() const; 60 | uint64_t nexamples() const { 61 | return nexamples_; 62 | } 63 | void writeGeneralMetrics(std::ostream& out, int32_t k) const; 64 | 65 | private: 66 | Metrics metrics_{}; 67 | uint64_t nexamples_; 68 | std::unordered_map labelMetrics_; 69 | }; 70 | 71 | } // namespace fasttext 72 | -------------------------------------------------------------------------------- /src/fasttext/model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "matrix.h" 17 | #include "real.h" 18 | #include "utils.h" 19 | #include "vector.h" 20 | 21 | namespace fasttext { 22 | 23 | class Loss; 24 | 25 | class Model { 26 | protected: 27 | std::shared_ptr wi_; 28 | std::shared_ptr wo_; 29 | std::shared_ptr loss_; 30 | bool normalizeGradient_; 31 | 32 | public: 33 | Model( 34 | std::shared_ptr wi, 35 | std::shared_ptr wo, 36 | std::shared_ptr loss, 37 | bool normalizeGradient); 38 | Model(const Model& model) = delete; 39 | Model(Model&& model) = delete; 40 | Model& operator=(const Model& other) = delete; 41 | Model& operator=(Model&& other) = delete; 42 | 43 | class State { 44 | private: 45 | real lossValue_; 46 | int64_t nexamples_; 47 | 48 | public: 49 | Vector hidden; 50 | Vector output; 51 | Vector grad; 52 | std::minstd_rand rng; 53 | 54 | State(int32_t hiddenSize, int32_t outputSize, int32_t seed); 55 | real getLoss() const; 56 | void incrementNExamples(real loss); 57 | }; 58 | 59 | void predict( 60 | const std::vector& input, 61 | int32_t k, 62 | real threshold, 63 | Predictions& heap, 64 | State& state) const; 65 | void update( 66 | const std::vector& input, 67 | const std::vector& targets, 68 | int32_t targetIndex, 69 | real lr, 70 | State& state); 71 | void computeHidden(const std::vector& input, State& state) const; 72 | 73 | real std_log(real) const; 74 | 75 | static const int32_t kUnlimitedPredictions = -1; 76 | static const int32_t kAllLabelsAsTarget = -1; 77 | }; 78 | 79 | } // namespace fasttext 80 | -------------------------------------------------------------------------------- /vignettes/supervised_learning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Supervised learning" 3 | author: "M. 
Benesty" 4 | output: rmarkdown::html_vignette 5 | date: "`r Sys.Date()`" 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{Supervised learning} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r supervised_learning} 13 | library(fastrtext) 14 | 15 | data("train_sentences") 16 | data("test_sentences") 17 | 18 | # prepare data 19 | tmp_file_model <- tempfile() 20 | 21 | train_labels <- paste0("__label__", train_sentences[,"class.text"]) 22 | train_texts <- tolower(train_sentences[,"text"]) 23 | train_to_write <- paste(train_labels, train_texts) 24 | train_tmp_file_txt <- tempfile() 25 | writeLines(text = train_to_write, con = train_tmp_file_txt) 26 | 27 | test_labels <- paste0("__label__", test_sentences[,"class.text"]) 28 | test_labels_without_prefix <- test_sentences[,"class.text"] 29 | test_texts <- tolower(test_sentences[,"text"]) 30 | test_to_write <- paste(test_labels, test_texts) 31 | 32 | # learn model 33 | execute(commands = c("supervised", "-input", train_tmp_file_txt, "-output", tmp_file_model, "-dim", 20, "-lr", 1, "-epoch", 20, "-wordNgrams", 2, "-verbose", 1)) 34 | 35 | # load model 36 | model <- load_model(tmp_file_model) 37 | 38 | # prediction are returned as a list with words and probabilities 39 | predictions <- predict(model, sentences = test_to_write) 40 | print(head(predictions, 5)) 41 | 42 | # Compute accuracy 43 | mean(names(unlist(predictions)) == test_labels_without_prefix) 44 | 45 | # because there is only one category by observation, hamming loss will be the same 46 | get_hamming_loss(as.list(test_labels_without_prefix), predictions) 47 | 48 | # test predictions 49 | predictions <- predict(model, sentences = test_to_write) 50 | print(head(predictions, 5)) 51 | 52 | # you can get flat list of results when you are retrieving only one label per observation 53 | print(head(predict(model, sentences = test_to_write, simplify = TRUE))) 54 | 55 | # free memory 56 | unlink(train_tmp_file_txt) 57 | 
unlink(tmp_file_model) 58 | rm(model) 59 | gc() 60 | ``` 61 | -------------------------------------------------------------------------------- /src/add_prefix.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | // [[Rcpp::interfaces(r, cpp)]] 3 | 4 | #include 5 | using namespace Rcpp; 6 | 7 | std::string add_pr(const std::string& line, const std::string& prefix); 8 | 9 | //' Add a prefix to each word 10 | //' 11 | //' Add a custom prefix to each word of a a line to create different spaces. 12 | //' Code in C++ (efficient). 13 | //' 14 | //' @param texts a [character] containing the original text 15 | //' @param prefix unit [character] containing the prefix to add (length == 1) or [character] with same length than texts 16 | //' @return [character] with prefixed words. 17 | //' @examples 18 | //' add_prefix(c("this is a test", "this is another test"), "#") 19 | //' @export 20 | // [[Rcpp::export]] 21 | CharacterVector add_prefix(const CharacterVector& texts, CharacterVector prefix) { 22 | 23 | const bool unique_prefix = prefix.size() == 1; 24 | 25 | if (!unique_prefix && prefix.size() != texts.size()) { 26 | stop("prefix should be a single string or the same size than text"); 27 | } 28 | 29 | std::string current_prefix; 30 | 31 | if (unique_prefix) { 32 | current_prefix = as(prefix[0]); 33 | } 34 | 35 | CharacterVector result(texts.size()); 36 | 37 | for (R_len_t i = 0; i < texts.size(); ++i) { 38 | if (!unique_prefix) { 39 | current_prefix = as(prefix[i]); 40 | } 41 | result[i] = add_pr(as(texts[i]), current_prefix); 42 | } 43 | return result; 44 | } 45 | 46 | // [[Rcpp::export]] 47 | std::string add_pr(const std::string& line, const std::string& prefix) { 48 | if (line.size() % 10 == 0) checkUserInterrupt(); 49 | 50 | std::string result; 51 | result.reserve(line.size() * 1.5); 52 | 53 | bool last_char_is_space = true; 54 | bool current_char_is_space; 55 | for (const char& current_char: line) { 
56 | current_char_is_space = (current_char == ' ') | (current_char == '\t'); 57 | if (last_char_is_space && !current_char_is_space) { 58 | result += prefix; 59 | } 60 | 61 | last_char_is_space = current_char_is_space; 62 | result += current_char; 63 | } 64 | return result; 65 | } 66 | -------------------------------------------------------------------------------- /src/fasttext/meter.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "meter.h" 10 | #include "utils.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | void Meter::log( 20 | const std::vector& labels, 21 | const Predictions& predictions) { 22 | nexamples_++; 23 | metrics_.gold += labels.size(); 24 | metrics_.predicted += predictions.size(); 25 | 26 | for (const auto& prediction : predictions) { 27 | labelMetrics_[prediction.second].predicted++; 28 | 29 | real score = std::exp(prediction.first); 30 | real gold = 0.0; 31 | if (utils::contains(labels, prediction.second)) { 32 | labelMetrics_[prediction.second].predictedGold++; 33 | metrics_.predictedGold++; 34 | gold = 1.0; 35 | } 36 | labelMetrics_[prediction.second].scoreVsTrue.emplace_back(score, gold); 37 | } 38 | 39 | for (const auto& label : labels) { 40 | labelMetrics_[label].gold++; 41 | } 42 | } 43 | 44 | double Meter::precision(int32_t i) { 45 | return labelMetrics_[i].precision(); 46 | } 47 | 48 | double Meter::recall(int32_t i) { 49 | return labelMetrics_[i].recall(); 50 | } 51 | 52 | double Meter::f1Score(int32_t i) { 53 | return labelMetrics_[i].f1Score(); 54 | } 55 | 56 | double Meter::precision() const { 57 | return metrics_.precision(); 58 | } 59 | 60 | double Meter::recall() const { 61 | return 
metrics_.recall(); 62 | } 63 | 64 | double Meter::f1Score() const { 65 | const double precision = this->precision(); 66 | const double recall = this->recall(); 67 | if (precision + recall != 0) { 68 | return 2 * precision * recall / (precision + recall); 69 | } 70 | return std::numeric_limits::quiet_NaN(); 71 | } 72 | 73 | void Meter::writeGeneralMetrics(std::ostream& out, int32_t k) const { 74 | out << "N" 75 | << "\t" << nexamples_ << std::endl; 76 | out << std::setprecision(3); 77 | out << "P@" << k << "\t" << metrics_.precision() << std::endl; 78 | out << "R@" << k << "\t" << metrics_.recall() << std::endl; 79 | } 80 | 81 | } // namespace fasttext 82 | -------------------------------------------------------------------------------- /man/build_vectors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{build_vectors} 4 | \alias{build_vectors} 5 | \title{Build fasttext vectors} 6 | \usage{ 7 | build_vectors(documents, model_path, modeltype = c("skipgram", "cbow"), 8 | bucket = 2e+06, dim = 100, epoch = 5, label = "__label__", 9 | loss = c("ns", "hs", "softmax", "ova", "one-vs-all"), lr = 0.05, 10 | lrUpdateRate = 100, maxn = 6, minCount = 5, minn = 3, neg = 5, 11 | t = 1e-04, thread = 12, verbose = 2, wordNgrams = 1, ws = 5) 12 | } 13 | \arguments{ 14 | \item{documents}{character vector of documents used for training} 15 | 16 | \item{model_path}{Name of output file \emph{without} file extension.} 17 | 18 | \item{modeltype}{Should training be done using skipgram or cbow? Defaults to skipgram.} 19 | 20 | \item{bucket}{number of buckets} 21 | 22 | \item{dim}{size of word vectors} 23 | 24 | \item{epoch}{number of epochs} 25 | 26 | \item{label}{text string, labels prefix. 
Default is "\strong{label}"} 27 | 28 | \item{loss}{loss function {ns, hs, softmax}} 29 | 30 | \item{lr}{learning rate} 31 | 32 | \item{lrUpdateRate}{change the rate of updates for the learning rate} 33 | 34 | \item{maxn}{max length of char ngram} 35 | 36 | \item{minCount}{minimal number of word occurences} 37 | 38 | \item{minn}{min length of char ngram} 39 | 40 | \item{neg}{number of negatives sampled} 41 | 42 | \item{t}{sampling threshold} 43 | 44 | \item{thread}{number of threads} 45 | 46 | \item{verbose}{verbosity level} 47 | 48 | \item{wordNgrams}{max length of word ngram} 49 | 50 | \item{ws}{size of the context window} 51 | } 52 | \value{ 53 | path to model file, as character 54 | } 55 | \description{ 56 | Trains a fasttext vector/unsupervised model following method described in 57 | \href{https://arxiv.org/abs/1607.04606}{Enriching Word Vectors with Subword Information} 58 | using the \href{https://fasttext.cc/}{fasttext} implementation. 59 | 60 | See \href{https://fasttext.cc/docs/en/unsupervised-tutorial.html}{FastText word representation tutorial} for more information on 61 | training unsupervised models using fasttext. 
62 | } 63 | \examples{ 64 | \dontrun{ 65 | library(fastrtext) 66 | text <- train_sentences 67 | model_file <- build_vectors(text[['text']], 'my_model') 68 | model <- load_model(model_file) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # 0.3.4 (10/27/19) 2 | 3 | * remove deprecated code to fix Cran warnings 4 | * update to last FastText version 5 | * support one vs all loss 6 | * less macro (remove possibility to stop learning with CTRL+C) 7 | 8 | # 0.3.2 (10.04.19) 9 | 10 | * following Cran email, remove debug symbols stripping 11 | * make unit tests stronger 12 | 13 | # 0.3.1 (07.03.19) 14 | 15 | * update the C++ source code to the current fasttext version 16 | * remove analogies function 17 | * update error message 18 | * add function to add prefix to words (to create different spaces in the same dataset, useful for classification in particular) 19 | * simplify tests 20 | 21 | # 0.2.6 (31.1.19) 22 | 23 | * use -pthread flag for better Cran compliance 24 | 25 | # 0.2.5 (4.1.18) 26 | 27 | * add get word id function 28 | * add tokenizer function 29 | * change the way sentence vectors are computed (use fastText internal code to improve speed) 30 | * remove RcppThread due to change in FB source code (no more printing from multiple threads at the same time) 31 | * add possibility to interrupt learning 32 | 33 | # 0.2.4 (9.12.17) 34 | 35 | * major refactoring 36 | * update to last version of fastText source code 37 | * sentence representation function 38 | * add tags function 39 | * fix compilation on Windows R Dev 40 | * better Makevars (related to strippedLib task) 41 | 42 | # 0.2.3 (9.11.17) 43 | 44 | * fix a cran note related to the DESCRIPTION file 45 | * remove documentation that is no longer useful because of previous update 46 | * add some asserts to avoid the case where some sentences have no prediction because all their
words are unknown (not seen during training) 47 | * fix compilation on Mac OS 48 | 49 | # 0.2.2 (07.11.17) 50 | 51 | * make possible to interrupt long computation (not for model training part) 52 | * add simplify option to predict (to get flat vector as a result) 53 | * remove prefix label in predict result 54 | * update fastText source code 55 | * fix crash when learning and setting verbose to 2 (calling Rcout from multiple threads crash the application) 56 | 57 | # 0.2.1 (18.09.17) 58 | 59 | * fix small bugs in compilation (mostly for mac os) 60 | * remove all notes (Cran) 61 | 62 | # 0.2.0 (15.09.17) 63 | 64 | * first Cran release 65 | * covers all basic features of fastText 66 | -------------------------------------------------------------------------------- /vignettes/list_commands.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "List of commands" 3 | author: "M. Benesty" 4 | output: rmarkdown::html_vignette 5 | date: "`r Sys.Date()`" 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{List of commands} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | As seen in the other tutorials, `execute()` function works in a similar way to the command line client. 
13 | To list the commands, you just need to enter the following function: 14 | 15 | ```{R eval=FALSE} 16 | library(fastrtext) 17 | 18 | print_help() 19 | ``` 20 | 21 | ``` 22 | The following arguments are mandatory: 23 | -input training file path 24 | -output output file path 25 | 26 | The following arguments are optional: 27 | -verbose verbosity level [2] 28 | 29 | The following arguments for the dictionary are optional: 30 | -minCount minimal number of word occurrences [1] 31 | -minCountLabel minimal number of label occurrences [0] 32 | -wordNgrams max length of word ngram [1] 33 | -bucket number of buckets [2000000] 34 | -minn min length of char ngram [0] 35 | -maxn max length of char ngram [0] 36 | -t sampling threshold [0.0001] 37 | -label labels prefix [__label__] 38 | 39 | The following arguments for training are optional: 40 | -lr learning rate [0.1] 41 | -lrUpdateRate change the rate of updates for the learning rate [100] 42 | -dim size of word vectors [100] 43 | -ws size of the context window [5] 44 | -epoch number of epochs [5] 45 | -neg number of negatives sampled [5] 46 | -loss loss function {ns, hs, softmax} [softmax] 47 | -thread number of threads [12] 48 | -pretrainedVectors pretrained word vectors for supervised learning [] 49 | -saveOutput whether output params should be saved [0] 50 | 51 | The following arguments for quantization are optional: 52 | -cutoff number of words and ngrams to retain [0] 53 | -retrain finetune embeddings if a cutoff is applied [0] 54 | -qnorm quantizing the norm separately [0] 55 | -qout quantizing the classifier [0] 56 | -dsub size of each sub-vector [2] 57 | ``` 58 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. 
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /src/fasttext/vector.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "vector.h" 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | #include "matrix.h" 17 | 18 | namespace fasttext { 19 | 20 | Vector::Vector(int64_t m) : data_(m) {} 21 | 22 | void Vector::zero() { 23 | std::fill(data_.begin(), data_.end(), 0.0); 24 | } 25 | 26 | real Vector::norm() const { 27 | real sum = 0; 28 | for (int64_t i = 0; i < size(); i++) { 29 | sum += data_[i] * data_[i]; 30 | } 31 | return std::sqrt(sum); 32 | } 33 | 34 | void Vector::mul(real a) { 35 | for (int64_t i = 0; i < size(); i++) { 36 | data_[i] *= a; 37 | } 38 | } 39 | 40 | void Vector::addVector(const Vector& source) { 41 | assert(size() == source.size()); 42 | for (int64_t i = 0; i < size(); i++) { 43 | data_[i] += source.data_[i]; 44 | } 45 | } 46 | 47 | void Vector::addVector(const Vector& source, real s) { 48 | assert(size() == source.size()); 49 | for (int64_t i = 0; i < size(); i++) { 50 | data_[i] += s * source.data_[i]; 51 | } 52 | } 53 | 54 | void Vector::addRow(const Matrix& A, int64_t i, real a) { 55 | assert(i >= 0); 56 | assert(i < A.size(0)); 57 | assert(size() == A.size(1)); 58 | A.addRowToVector(*this, i, a); 59 | } 60 | 61 | void Vector::addRow(const Matrix& A, int64_t i) { 62 | assert(i >= 0); 63 | assert(i < A.size(0)); 64 | assert(size() == A.size(1)); 65 | A.addRowToVector(*this, i); 66 | } 67 | 68 | void Vector::mul(const Matrix& A, const Vector& vec) { 69 | assert(A.size(0) == size()); 70 | assert(A.size(1) == vec.size()); 71 | for (int64_t i = 0; i < size(); i++) { 72 | data_[i] = A.dotRow(vec, i); 73 | } 74 | } 75 | 76 | int64_t Vector::argmax() { 77 | real max = data_[0]; 78 | int64_t argmax = 0; 79 | for (int64_t i = 1; i < size(); i++) { 80 | if (data_[i] > max) { 81 | max = data_[i]; 82 | argmax = i; 83 | } 84 | } 85 | return argmax; 86 | } 87 | 88 | std::ostream& operator<<(std::ostream& os, const Vector& v) { 89 | os << std::setprecision(5); 90 | for (int64_t j = 0; j < v.size(); j++) { 91 | os << v[j] << ' '; 92 | } 93 | 
return os; 94 | } 95 | 96 | } // namespace fasttext 97 | -------------------------------------------------------------------------------- /src/fasttext/args.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | enum class model_name : int { cbow = 1, sg, sup }; 20 | enum class loss_name : int { hs = 1, ns, softmax, ova }; 21 | enum class metric_name : int { f1score = 1, labelf1score }; 22 | 23 | class Args { 24 | protected: 25 | std::string boolToString(bool) const; 26 | std::string modelToString(model_name) const; 27 | std::string metricToString(metric_name) const; 28 | std::unordered_set manualArgs_; 29 | 30 | public: 31 | Args(); 32 | std::string input; 33 | std::string output; 34 | double lr; 35 | int lrUpdateRate; 36 | int dim; 37 | int ws; 38 | int epoch; 39 | int minCount; 40 | int minCountLabel; 41 | int neg; 42 | int wordNgrams; 43 | loss_name loss; 44 | model_name model; 45 | int bucket; 46 | int minn; 47 | int maxn; 48 | int thread; 49 | double t; 50 | std::string label; 51 | int verbose; 52 | std::string pretrainedVectors; 53 | bool saveOutput; 54 | int seed; 55 | 56 | bool qout; 57 | bool retrain; 58 | bool qnorm; 59 | size_t cutoff; 60 | size_t dsub; 61 | 62 | std::string autotuneValidationFile; 63 | std::string autotuneMetric; 64 | int autotunePredictions; 65 | int autotuneDuration; 66 | std::string autotuneModelSize; 67 | 68 | void parseArgs(const std::vector& args); 69 | void printHelp(); 70 | void printBasicHelp(); 71 | void printDictionaryHelp(); 72 | void printTrainingHelp(); 73 | void printAutotuneHelp(); 74 | void printQuantizationHelp(); 75 | void 
save(std::ostream&); 76 | void load(std::istream&); 77 | void dump(std::ostream&) const; 78 | bool hasAutotune() const; 79 | bool isManual(const std::string& argName) const; 80 | void setManual(const std::string& argName); 81 | std::string lossToString(loss_name) const; 82 | metric_name getAutotuneMetric() const; 83 | std::string getAutotuneMetricLabel() const; 84 | int64_t getAutotuneModelSize() const; 85 | 86 | static constexpr double kUnlimitedModelSize = -1.0; 87 | }; 88 | } // namespace fasttext 89 | -------------------------------------------------------------------------------- /src/fasttext/densematrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "matrix.h" 19 | #include "real.h" 20 | 21 | namespace fasttext { 22 | 23 | class Vector; 24 | 25 | class DenseMatrix : public Matrix { 26 | protected: 27 | std::vector data_; 28 | void uniformThread(real, int, int32_t); 29 | 30 | public: 31 | DenseMatrix(); 32 | explicit DenseMatrix(int64_t, int64_t); 33 | DenseMatrix(const DenseMatrix&) = default; 34 | DenseMatrix(DenseMatrix&&) noexcept; 35 | DenseMatrix& operator=(const DenseMatrix&) = delete; 36 | DenseMatrix& operator=(DenseMatrix&&) = delete; 37 | virtual ~DenseMatrix() noexcept override = default; 38 | 39 | inline real* data() { 40 | return data_.data(); 41 | } 42 | inline const real* data() const { 43 | return data_.data(); 44 | } 45 | 46 | inline const real& at(int64_t i, int64_t j) const { 47 | assert(i * n_ + j < data_.size()); 48 | return data_[i * n_ + j]; 49 | }; 50 | inline real& at(int64_t i, int64_t j) { 51 | return data_[i * n_ + j]; 52 | }; 53 | 54 | inline 
int64_t rows() const { 55 | return m_; 56 | } 57 | inline int64_t cols() const { 58 | return n_; 59 | } 60 | void zero(); 61 | void uniform(real, unsigned int, int32_t); 62 | 63 | void multiplyRow(const Vector& nums, int64_t ib = 0, int64_t ie = -1); 64 | void divideRow(const Vector& denoms, int64_t ib = 0, int64_t ie = -1); 65 | 66 | real l2NormRow(int64_t i) const; 67 | void l2NormRow(Vector& norms) const; 68 | 69 | real dotRow(const Vector&, int64_t) const override; 70 | void addVectorToRow(const Vector&, int64_t, real) override; 71 | void addRowToVector(Vector& x, int32_t i) const override; 72 | void addRowToVector(Vector& x, int32_t i, real a) const override; 73 | void save(std::ostream&) const override; 74 | void load(std::istream&) override; 75 | void dump(std::ostream&) const override; 76 | 77 | class EncounteredNaNError : public std::runtime_error { 78 | public: 79 | EncounteredNaNError() : std::runtime_error("Encountered NaN.") {} 80 | }; 81 | }; 82 | } // namespace fasttext 83 | -------------------------------------------------------------------------------- /src/fasttext/model.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "model.h" 10 | #include "loss.h" 11 | #include "utils.h" 12 | 13 | #include 14 | #include 15 | 16 | namespace fasttext { 17 | 18 | Model::State::State(int32_t hiddenSize, int32_t outputSize, int32_t seed) 19 | : lossValue_(0.0), 20 | nexamples_(0), 21 | hidden(hiddenSize), 22 | output(outputSize), 23 | grad(hiddenSize), 24 | rng(seed) {} 25 | 26 | real Model::State::getLoss() const { 27 | return lossValue_ / nexamples_; 28 | } 29 | 30 | void Model::State::incrementNExamples(real loss) { 31 | lossValue_ += loss; 32 | nexamples_++; 33 | } 34 | 35 | Model::Model( 36 | std::shared_ptr wi, 37 | std::shared_ptr wo, 38 | std::shared_ptr loss, 39 | bool normalizeGradient) 40 | : wi_(wi), wo_(wo), loss_(loss), normalizeGradient_(normalizeGradient) {} 41 | 42 | void Model::computeHidden(const std::vector& input, State& state) 43 | const { 44 | Vector& hidden = state.hidden; 45 | hidden.zero(); 46 | for (auto it = input.cbegin(); it != input.cend(); ++it) { 47 | hidden.addRow(*wi_, *it); 48 | } 49 | hidden.mul(1.0 / input.size()); 50 | } 51 | 52 | void Model::predict( 53 | const std::vector& input, 54 | int32_t k, 55 | real threshold, 56 | Predictions& heap, 57 | State& state) const { 58 | if (k == Model::kUnlimitedPredictions) { 59 | k = wo_->size(0); // output size 60 | } else if (k <= 0) { 61 | throw std::invalid_argument("k needs to be 1 or higher!"); 62 | } 63 | heap.reserve(k + 1); 64 | computeHidden(input, state); 65 | 66 | loss_->predict(k, threshold, heap, state); 67 | } 68 | 69 | void Model::update( 70 | const std::vector& input, 71 | const std::vector& targets, 72 | int32_t targetIndex, 73 | real lr, 74 | State& state) { 75 | if (input.size() == 0) { 76 | return; 77 | } 78 | computeHidden(input, state); 79 | 80 | Vector& grad = state.grad; 81 | grad.zero(); 82 | real lossValue = loss_->forward(targets, targetIndex, state, lr, true); 83 | state.incrementNExamples(lossValue); 84 | 85 | if (normalizeGradient_) { 86 | grad.mul(1.0 / 
input.size()); 87 | } 88 | for (auto it = input.cbegin(); it != input.cend(); ++it) { 89 | wi_->addVectorToRow(grad, *it, 1.0); 90 | } 91 | } 92 | 93 | real Model::std_log(real x) const { 94 | return std::log(x + 1e-5); 95 | } 96 | 97 | } // namespace fasttext 98 | -------------------------------------------------------------------------------- /src/fasttext/autotune.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "args.h" 18 | #include "fasttext.h" 19 | 20 | namespace fasttext { 21 | 22 | class AutotuneStrategy { 23 | private: 24 | Args bestArgs_; 25 | int maxDuration_; 26 | std::minstd_rand rng_; 27 | int trials_; 28 | int bestMinnIndex_; 29 | int bestDsubExponent_; 30 | int bestNonzeroBucket_; 31 | std::vector minnChoices_; 32 | int getIndex(int val, const std::vector& choices); 33 | 34 | public: 35 | explicit AutotuneStrategy( 36 | const Args& args, 37 | std::minstd_rand::result_type seed); 38 | Args ask(double elapsed); 39 | void updateBest(const Args& args); 40 | }; 41 | 42 | class Autotune { 43 | protected: 44 | std::shared_ptr fastText_; 45 | double elapsed_; 46 | double bestScore_; 47 | int32_t trials_; 48 | int32_t sizeConstraintFailed_; 49 | std::atomic continueTraining_; 50 | std::unique_ptr strategy_; 51 | std::thread timer_; 52 | 53 | bool keepTraining(double maxDuration) const; 54 | void printInfo(double maxDuration); 55 | void timer( 56 | const std::chrono::steady_clock::time_point& start, 57 | double maxDuration); 58 | void abort(); 59 | void startTimer(const Args& args); 60 | double getMetricScore( 61 | Meter& meter, 62 | const metric_name& metricName, 63 | const 
std::string& metricLabel) const; 64 | void printArgs(const Args& args, const Args& autotuneArgs); 65 | void printSkippedArgs(const Args& autotuneArgs); 66 | bool quantize(Args& args, const Args& autotuneArgs); 67 | int getCutoffForFileSize(bool qout, bool qnorm, int dsub, int64_t fileSize) 68 | const; 69 | 70 | class TimeoutError : public std::runtime_error { 71 | public: 72 | TimeoutError() : std::runtime_error("Autotune timed out.") {} 73 | }; 74 | 75 | static constexpr double kUnknownBestScore = -1.0; 76 | static constexpr int kCutoffLimit = 256; 77 | 78 | public: 79 | Autotune() = delete; 80 | explicit Autotune(const std::shared_ptr& fastText); 81 | Autotune(const Autotune&) = delete; 82 | Autotune(Autotune&&) = delete; 83 | Autotune& operator=(const Autotune&) = delete; 84 | Autotune& operator=(Autotune&&) = delete; 85 | ~Autotune() noexcept = default; 86 | 87 | void train(const Args& args); 88 | }; 89 | 90 | } // namespace fasttext 91 | -------------------------------------------------------------------------------- /man/build_supervised.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/API.R 3 | \name{build_supervised} 4 | \alias{build_supervised} 5 | \title{Build a supervised fasttext model} 6 | \usage{ 7 | build_supervised(documents, targets, model_path, lr = 0.05, dim = 100, 8 | ws = 5, epoch = 5, minCount = 5, minCountLabel = 0, neg = 5, 9 | wordNgrams = 1, loss = c("ns", "hs", "softmax", "ova", "one-vs-all"), 10 | bucket = 2e+06, minn = 3, maxn = 6, thread = 12, 11 | lrUpdateRate = 100, t = 1e-04, label = "__label__", verbose = 2, 12 | pretrainedVectors = NULL) 13 | } 14 | \arguments{ 15 | \item{documents}{character vector of documents used for training} 16 | 17 | \item{targets}{vector of targets/catagory of each document. 
Must have same length as \code{documents} and be coercible to character} 18 | 19 | \item{model_path}{Name of output file \emph{without} file extension.} 20 | 21 | \item{lr}{learning rate} 22 | 23 | \item{dim}{size of word vectors} 24 | 25 | \item{ws}{size of the context window} 26 | 27 | \item{epoch}{number of epochs} 28 | 29 | \item{minCount}{minimal number of word occurrences} 30 | 31 | \item{minCountLabel}{minimal number of label occurrences} 32 | 33 | \item{neg}{number of negatives sampled} 34 | 35 | \item{wordNgrams}{max length of word ngram} 36 | 37 | \item{loss}{= c('softmax', 'ns', 'hs', 'ova'), loss function {ns, hs, softmax, one-vs-all}. one-vs-all loss is useful for multi-class problems when you need to apply a threshold for each class score.} 38 | 39 | \item{bucket}{number of buckets} 40 | 41 | \item{minn}{min length of char ngram} 42 | 43 | \item{maxn}{max length of char ngram} 44 | 45 | \item{thread}{number of threads} 46 | 47 | \item{lrUpdateRate}{change the rate of updates for the learning rate} 48 | 49 | \item{t}{sampling threshold} 50 | 51 | \item{label}{text string, labels prefix. Default is "\strong{label}"} 52 | 53 | \item{verbose}{verbosity level} 54 | 55 | \item{pretrainedVectors}{path to pretrained word vectors for supervised learning. Leave empty for no pretrained vectors.} 56 | } 57 | \value{ 58 | path to new model file as a \code{character} 59 | } 60 | \description{ 61 | Trains a supervised model, following the method laid out in 62 | \href{https://arxiv.org/abs/1607.01759}{Bag of Tricks for Efficient Text Classification} 63 | using the \href{https://fasttext.cc/}{fasttext} implementation. 64 | 65 | See \href{https://fasttext.cc/docs/en/supervised-tutorial.html}{FastText text classification tutorial} for more information on 66 | training supervised models using fasttext.
67 | } 68 | \examples{ 69 | \dontrun{ 70 | library(fastrtext) 71 | model_file <- build_supervised(documents = train_sentences[["text"]], 72 | targets =train_sentences[["class.text"]], 73 | model_path = 'my_model', 74 | dim = 20, lr = 1, epoch = 20, wordNgrams = 2) 75 | 76 | model <- load_model(model_file) 77 | 78 | predictions <- predict(model, test_sentences[["text"]]) 79 | mean(sapply(predictions, names) == test_sentences[["class.text"]]) 80 | # ~0.8 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Version 0.3.4 2 | 3 | fix deprecated error messages. 4 | 5 | ## Version 0.3.3 6 | small update 7 | 8 | ## Version 0.3.2 9 | Following Cran e-mail (from Prof Brian Ripley), remove strip-debug in Makevars file. 10 | Now the size of the package is > 10Mb and it generates a Warning regarding its size. 11 | 12 | ``` 13 | Please remove unconditional stripping ASAP and before Apr 24 to safely 14 | retain the package on CRAN. 15 | ``` 16 | 17 | ## Version 0.3.1 18 | Following cran response, tests have been shortened by using multithreading during model training (before use only 1 thread) 19 | 20 | ## Version 0.3.0 21 | * big C++ code update 22 | * fix littl bug in catching C++ exception as reported by Cran server 23 | 24 | I had to re-upload the package as bug in a corner case appeared. 25 | I am sorry for that. 26 | 27 | ## Version 0.2.6 28 | * use -pthread flag for better Cran compliancy 29 | 30 | ## Version 0.2.3 - 08.10.17 - answer 31 | Cran: 32 | Thanks, we see that the Date field is over a month old. 33 | Is this the right version? 34 | 35 | Answer: 36 | This is the right version but I have made an error in the date. 
37 | 38 | 39 | ## Version 0.2.3 - 08.10.17 40 | * fix Cran notes 41 | * fix a bug introduced in the last update 42 | 43 | ## Version 0.2.2 - 06.10.17 44 | * add a dependency to fix some very specific crash due to Rcout called from different threads. 45 | * fix compilation on Mac OS with RcppThread 46 | 47 | ## Version 0.2.1 - 18.09.17 48 | * Fix compilation crash on Mac OS 49 | * remove notes on R devel 50 | 51 | ## Comments from Swetlana Herbrandt - 15.09.17 - 4:45PM (French time) 52 | * please omit the redundant 'R' in your title -> the R is now removed from the title field. 53 | * please write package names and software names in Title and Description in single quotes (e.g. 'FastText'). -> quotes have been applied on any software name. 54 | * please add an URL for 'FastText' in the form or with angle brackets for auto-linking and no space after 'http:' and 'https:' -> the link has been added at the end of the description text. 55 | * we see code lines such as Copyright (c) 2016-present, Facebook, Inc. All rights reserved. Please add all authors and copyright holders in the Authors@R field with the appropriate roles. -> a new person Facebook, Inc. has been added, with the role cph 56 | 57 | Note to Cran 58 | ------------ 59 | The introduction of quotes (see above) has raised a new note: 60 | "The Description field should start with a capital letter." 61 | This is wanted. 62 | 63 | 64 | ## Test environments 65 | * Local Ubuntu 17.04 + R version 3.4.1 66 | * R-Hub Cran check (Linux + Windows) 67 | * AppVeyor, Windows 68 | * Travis-CI, Linux 69 | 70 | ## R CMD check results 71 | * Local: no warning, no note 72 | * Travis-CI: no warning, no note 73 | * R-Hub Cran check: 74 | * 1 note: "Possibly mis-spelled words in DESCRIPTION" -> there is no error 75 | * AppVeyor: 76 | * 1 note: "Found no calls to: 'R_registerRoutines', 'R_useDynamicSymbols'" -> AppVeyor tool chain is known to not be up to date, it may be the cause of this note.
The note can't be reproduced on R-Hub Windows check. 77 | -------------------------------------------------------------------------------- /inst/include/fastrtext_RcppExports.h: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #ifndef RCPP_fastrtext_RCPPEXPORTS_H_GEN_ 5 | #define RCPP_fastrtext_RCPPEXPORTS_H_GEN_ 6 | 7 | #include 8 | 9 | namespace fastrtext { 10 | 11 | using namespace Rcpp; 12 | 13 | namespace { 14 | void validateSignature(const char* sig) { 15 | Rcpp::Function require = Rcpp::Environment::base_env()["require"]; 16 | require("fastrtext", Rcpp::Named("quietly") = true); 17 | typedef int(*Ptr_validate)(const char*); 18 | static Ptr_validate p_validate = (Ptr_validate) 19 | R_GetCCallable("fastrtext", "_fastrtext_RcppExport_validate"); 20 | if (!p_validate(sig)) { 21 | throw Rcpp::function_not_exported( 22 | "C++ function with signature '" + std::string(sig) + "' not found in fastrtext"); 23 | } 24 | } 25 | } 26 | 27 | inline CharacterVector add_prefix(const CharacterVector& texts, CharacterVector prefix) { 28 | typedef SEXP(*Ptr_add_prefix)(SEXP,SEXP); 29 | static Ptr_add_prefix p_add_prefix = NULL; 30 | if (p_add_prefix == NULL) { 31 | validateSignature("CharacterVector(*add_prefix)(const CharacterVector&,CharacterVector)"); 32 | p_add_prefix = (Ptr_add_prefix)R_GetCCallable("fastrtext", "_fastrtext_add_prefix"); 33 | } 34 | RObject rcpp_result_gen; 35 | { 36 | RNGScope RCPP_rngScope_gen; 37 | rcpp_result_gen = p_add_prefix(Shield(Rcpp::wrap(texts)), Shield(Rcpp::wrap(prefix))); 38 | } 39 | if (rcpp_result_gen.inherits("interrupted-error")) 40 | throw Rcpp::internal::InterruptedException(); 41 | if (Rcpp::internal::isLongjumpSentinel(rcpp_result_gen)) 42 | throw Rcpp::LongjumpException(rcpp_result_gen); 43 | if (rcpp_result_gen.inherits("try-error")) 44 | throw 
Rcpp::exception(Rcpp::as(rcpp_result_gen).c_str()); 45 | return Rcpp::as(rcpp_result_gen); 46 | } 47 | 48 | inline std::string add_pr(const std::string& line, const std::string& prefix) { 49 | typedef SEXP(*Ptr_add_pr)(SEXP,SEXP); 50 | static Ptr_add_pr p_add_pr = NULL; 51 | if (p_add_pr == NULL) { 52 | validateSignature("std::string(*add_pr)(const std::string&,const std::string&)"); 53 | p_add_pr = (Ptr_add_pr)R_GetCCallable("fastrtext", "_fastrtext_add_pr"); 54 | } 55 | RObject rcpp_result_gen; 56 | { 57 | RNGScope RCPP_rngScope_gen; 58 | rcpp_result_gen = p_add_pr(Shield(Rcpp::wrap(line)), Shield(Rcpp::wrap(prefix))); 59 | } 60 | if (rcpp_result_gen.inherits("interrupted-error")) 61 | throw Rcpp::internal::InterruptedException(); 62 | if (Rcpp::internal::isLongjumpSentinel(rcpp_result_gen)) 63 | throw Rcpp::LongjumpException(rcpp_result_gen); 64 | if (rcpp_result_gen.inherits("try-error")) 65 | throw Rcpp::exception(Rcpp::as(rcpp_result_gen).c_str()); 66 | return Rcpp::as(rcpp_result_gen); 67 | } 68 | 69 | } 70 | 71 | #endif // RCPP_fastrtext_RCPPEXPORTS_H_GEN_ 72 | -------------------------------------------------------------------------------- /tests/testthat/test-unsupervised.R: -------------------------------------------------------------------------------- 1 | context("Unsupervised training") 2 | 3 | model_test_path <- system.file("extdata", 4 | "model_unsupervised_test.bin", 5 | package = "fastrtext") 6 | 7 | test_that("Training", { 8 | data("train_sentences") 9 | data("test_sentences") 10 | texts <- tolower(train_sentences[, "text"]) 11 | tmp_file_txt <- tempfile() 12 | tmp_file_model <- tempfile() 13 | writeLines(text = texts, con = tmp_file_txt) 14 | execute(commands = c("skipgram", 15 | "-input", tmp_file_txt, 16 | "-output", tmp_file_model, 17 | "-verbose", 0, 18 | "-dim", 10, 19 | "-bucket", 1e3, 20 | "-loss", "ns", 21 | "-epoch", 3)) 22 | 23 | # Check learned file exists 24 | expect_true(file.exists(paste0(tmp_file_model, ".bin"))) 25 | 
expect_true(file.exists(paste0(tmp_file_model, ".vec"))) 26 | 27 | model <- load_model(tmp_file_model) 28 | parameters <- get_parameters(model) 29 | expect_equal(parameters$model_name, "sg") 30 | 31 | build_vectors(documents = texts, 32 | model_path = tmp_file_model, 33 | modeltype = "skipgram", 34 | bucket = 1e3, 35 | dim = 10, 36 | epoch = 3, 37 | loss = "softmax", 38 | verbose = 0) 39 | 40 | }) 41 | 42 | test_that("Test parameter extraction", { 43 | model <- load_model(model_test_path) 44 | parameters <- get_parameters(model) 45 | expect_equal(parameters$dim, 70) 46 | expect_equal(parameters$model_name, "sg") 47 | }) 48 | 49 | test_that("Test word extraction and word IDs", { 50 | model <- load_model(model_test_path) 51 | dict <- get_dictionary(model) 52 | expect_length(dict, 2061) 53 | expect_true("time" %in% dict) 54 | expect_true("timing" %in% dict) 55 | expect_true("experience" %in% dict) 56 | expect_true("section" %in% dict) 57 | 58 | sentence_to_test <- c("this", "is", "a", "test") 59 | ids <- get_word_ids(model, sentence_to_test) 60 | expect_equal(get_dictionary(model)[ids], sentence_to_test) 61 | }) 62 | 63 | test_that("Tokenization separate words in a text document", { 64 | model <- load_model(model_test_path) 65 | tokens <- get_tokenized_text(model, "this is a test") 66 | expect_equal(tokens, list(c("this", "is", "a", "test"))) 67 | }) 68 | 69 | test_that("Test word embeddings", { 70 | model <- load_model(model_test_path) 71 | 72 | # test vector lentgh 73 | parameters <- get_parameters(model) 74 | expect_length(get_word_vectors(model, "time")[1, ], parameters$dim) 75 | 76 | # test word distance 77 | expect_lt(get_word_distance(model, "introduction", "conclusions"), 78 | get_word_distance(model, "experience", "section")) 79 | expect_lt(get_word_distance(model, "our", "we"), 80 | get_word_distance(model, "introduction", "conclusions")) 81 | }) 82 | 83 | test_that("Nearest neighbours", { 84 | model <- load_model(model_test_path) 85 | nn <- get_nn(model, 
"time", 10) 86 | expect_true("times" %in% names(nn)) 87 | }) 88 | 89 | test_that("Test sentence representation", { 90 | model <- load_model(model_test_path) 91 | m <- get_sentence_representation(model, "this is a test") 92 | expect_length(m, 70) 93 | expect_equal(nrow(m), 1) 94 | m <- get_sentence_representation(model, c("this is a test", "and here is another")) 95 | expect_equal(nrow(m), 2) 96 | expect_false(any(is.na(m))) 97 | }) 98 | 99 | gc() 100 | -------------------------------------------------------------------------------- /src/fasttext/dictionary.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "args.h" 20 | #include "real.h" 21 | 22 | namespace fasttext { 23 | 24 | typedef int32_t id_type; 25 | enum class entry_type : int8_t { word = 0, label = 1 }; 26 | 27 | struct entry { 28 | std::string word; 29 | int64_t count; 30 | entry_type type; 31 | std::vector subwords; 32 | }; 33 | 34 | class Dictionary { 35 | protected: 36 | static const int32_t MAX_VOCAB_SIZE = 30000000; 37 | static const int32_t MAX_LINE_SIZE = 1024; 38 | 39 | int32_t find(const std::string&) const; 40 | int32_t find(const std::string&, uint32_t h) const; 41 | void initTableDiscard(); 42 | void initNgrams(); 43 | void reset(std::istream&) const; 44 | void pushHash(std::vector&, int32_t) const; 45 | void addSubwords(std::vector&, const std::string&, int32_t) const; 46 | 47 | std::shared_ptr args_; 48 | std::vector word2int_; 49 | std::vector words_; 50 | 51 | std::vector pdiscard_; 52 | int32_t size_; 53 | int32_t nwords_; 54 | int32_t nlabels_; 55 | int64_t ntokens_; 56 | 57 | int64_t 
pruneidx_size_; 58 | std::unordered_map pruneidx_; 59 | void addWordNgrams( 60 | std::vector& line, 61 | const std::vector& hashes, 62 | int32_t n) const; 63 | 64 | public: 65 | static const std::string EOS; 66 | static const std::string BOW; 67 | static const std::string EOW; 68 | 69 | explicit Dictionary(std::shared_ptr); 70 | explicit Dictionary(std::shared_ptr, std::istream&); 71 | int32_t nwords() const; 72 | int32_t nlabels() const; 73 | int64_t ntokens() const; 74 | int32_t getId(const std::string&) const; 75 | int32_t getId(const std::string&, uint32_t h) const; 76 | entry_type getType(int32_t) const; 77 | entry_type getType(const std::string&) const; 78 | bool discard(int32_t, real) const; 79 | std::string getWord(int32_t) const; 80 | const std::vector& getSubwords(int32_t) const; 81 | const std::vector getSubwords(const std::string&) const; 82 | void getSubwords( 83 | const std::string&, 84 | std::vector&, 85 | std::vector&) const; 86 | void computeSubwords( 87 | const std::string&, 88 | std::vector&, 89 | std::vector* substrings = nullptr) const; 90 | uint32_t hash(const std::string& str) const; 91 | void add(const std::string&); 92 | bool readWord(std::istream&, std::string&) const; 93 | void readFromFile(std::istream&); 94 | std::string getLabel(int32_t) const; 95 | void save(std::ostream&) const; 96 | void load(std::istream&); 97 | std::vector getCounts(entry_type) const; 98 | int32_t getLine(std::istream&, std::vector&, std::vector&) 99 | const; 100 | int32_t getLine(std::istream&, std::vector&, std::minstd_rand&) 101 | const; 102 | void threshold(int64_t, int64_t); 103 | void prune(std::vector&); 104 | bool isPruned() { 105 | return pruneidx_size_ >= 0; 106 | } 107 | void dump(std::ostream&) const; 108 | void init(); 109 | }; 110 | 111 | } // namespace fasttext 112 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' 
Sentence corpus - train part 2 | #' 3 | #' This corpus contains sentences from 4 | #' the abstract and introduction of 30 scientific articles that have been 5 | #' annotated (i.e. labeled or tagged) according to a modified version of the 6 | #' Argumentative Zones annotation scheme. 7 | #' 8 | #' These 30 scientific articles come 9 | #' from three different domains: 10 | #' 1. PLoS Computational Biology (PLOS) 11 | #' 2. The machine learning repository on arXiv (ARXIV) 12 | #' 3. The psychology journal Judgment and Decision Making (JDM) 13 | #' 14 | #' There are 10 articles from each domain. In addition to the labeled data, this 15 | #' corpus also contains a corresponding set of unlabeled articles. These unlabeled 16 | #' articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 17 | #' from each domain (again, only the sentences from the abstract and 18 | #' introduction). These unlabeled articles can be used for unsupervised or 19 | #' semi-supervised approaches to sentence classification which rely on a small set 20 | #' of labeled data and a larger set of unlabeled data. 21 | #' 22 | #' ===== References ===== 23 | #' 24 | #' S. Teufel and M. Moens. Summarizing scientific articles: experiments with 25 | #' relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 26 | #' 2002. 27 | #' 28 | #' S. Teufel. Argumentative zoning: information extraction from scientific 29 | #' text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 
30 | #' 31 | #' @format 2 data frame with 3117 rows and 2 variables: 32 | #' \describe{ 33 | #' \item{text}{the sentences as a character vector} 34 | #' \item{class.text}{the category of the sentence} 35 | #' } 36 | #' @source \url{https://archive.ics.uci.edu/ml/index.php} 37 | "train_sentences" 38 | 39 | #' Sentence corpus - test part 40 | #' 41 | #' This corpus contains sentences from 42 | #' the abstract and introduction of 30 scientific articles that have been 43 | #' annotated (i.e. labeled or tagged) according to a modified version of the 44 | #' Argumentative Zones annotation scheme. 45 | #' 46 | #' These 30 scientific articles come 47 | #' from three different domains: 48 | #' 1. PLoS Computational Biology (PLOS) 49 | #' 2. The machine learning repository on arXiv (ARXIV) 50 | #' 3. The psychology journal Judgment and Decision Making (JDM) 51 | #' 52 | #' There are 10 articles from each domain. In addition to the labeled data, this 53 | #' corpus also contains a corresponding set of unlabeled articles. These unlabeled 54 | #' articles also come from PLOS, ARXIV, and JDM. There are 300 unlabeled articles 55 | #' from each domain (again, only the sentences from the abstract and 56 | #' introduction). These unlabeled articles can be used for unsupervised or 57 | #' semi-supervised approaches to sentence classification which rely on a small set 58 | #' of labeled data and a larger set of unlabeled data. 59 | #' 60 | #' ===== References ===== 61 | #' 62 | #' S. Teufel and M. Moens. Summarizing scientific articles: experiments with 63 | #' relevance and rhetorical status. Computational Linguistics, 28(4):409-445, 64 | #' 2002. 65 | #' 66 | #' S. Teufel. Argumentative zoning: information extraction from scientific 67 | #' text. PhD thesis, School of Informatics, University of Edinburgh, 1999. 
68 | #' 69 | #' @format 2 data frame with 3117 rows and 2 variables: 70 | #' \describe{ 71 | #' \item{text}{the sentences as a character vector} 72 | #' \item{class.text}{the category of the sentence} 73 | #' } 74 | #' @source \url{https://archive.ics.uci.edu/ml/index.php} 75 | "test_sentences" 76 | 77 | #' Stop words list 78 | #' 79 | #' List of words that can be safely removed from sentences. 80 | #' 81 | #' @format Character vector of stop words 82 | #' @source \url{https://archive.ics.uci.edu/ml/index.php} 83 | "stop_words_sentences" 84 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $("#sidebar") 6 | .stick_in_parent({offset_top: 40}) 7 | .on('sticky_kit:bottom', function(e) { 8 | $(this).parent().css('position', 'static'); 9 | }) 10 | .on('sticky_kit:unbottom', function(e) { 11 | $(this).parent().css('position', 'relative'); 12 | }); 13 | 14 | $('body').scrollspy({ 15 | target: '#sidebar', 16 | offset: 60 17 | }); 18 | 19 | $('[data-toggle="tooltip"]').tooltip(); 20 | 21 | var cur_path = paths(location.pathname); 22 | var links = $("#navbar ul li a"); 23 | var max_length = -1; 24 | var pos = -1; 25 | for (var i = 0; i < links.length; i++) { 26 | if (links[i].getAttribute("href") === "#") 27 | continue; 28 | // Ignore external links 29 | if (links[i].host !== location.host) 30 | continue; 31 | 32 | var nav_path = paths(links[i].pathname); 33 | 34 | var length = prefix_length(nav_path, cur_path); 35 | if (length > max_length) { 36 | max_length = length; 37 | pos = i; 38 | } 39 | } 40 | 41 | // Add class to parent
  • , and enclosing
  • if in dropdown 42 | if (pos >= 0) { 43 | var menu_anchor = $(links[pos]); 44 | menu_anchor.parent().addClass("active"); 45 | menu_anchor.closest("li.dropdown").addClass("active"); 46 | } 47 | }); 48 | 49 | function paths(pathname) { 50 | var pieces = pathname.split("/"); 51 | pieces.shift(); // always starts with / 52 | 53 | var end = pieces[pieces.length - 1]; 54 | if (end === "index.html" || end === "") 55 | pieces.pop(); 56 | return(pieces); 57 | } 58 | 59 | // Returns -1 if not found 60 | function prefix_length(needle, haystack) { 61 | if (needle.length > haystack.length) 62 | return(-1); 63 | 64 | // Special case for length-0 haystack, since for loop won't run 65 | if (haystack.length === 0) { 66 | return(needle.length === 0 ? 0 : -1); 67 | } 68 | 69 | for (var i = 0; i < haystack.length; i++) { 70 | if (needle[i] != haystack[i]) 71 | return(i); 72 | } 73 | 74 | return(haystack.length); 75 | } 76 | 77 | /* Clipboard --------------------------*/ 78 | 79 | function changeTooltipMessage(element, msg) { 80 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 81 | element.setAttribute('data-original-title', msg); 82 | $(element).tooltip('show'); 83 | element.setAttribute('data-original-title', tooltipOriginalTitle); 84 | } 85 | 86 | if(ClipboardJS.isSupported()) { 87 | $(document).ready(function() { 88 | var copyButton = ""; 89 | 90 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 91 | 92 | // Insert copy buttons: 93 | $(copyButton).prependTo(".hasCopyButton"); 94 | 95 | // Initialize tooltips: 96 | $('.btn-copy-ex').tooltip({container: 'body'}); 97 | 98 | // Initialize clipboard: 99 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 100 | text: function(trigger) { 101 | return trigger.parentNode.textContent; 102 | } 103 | }); 104 | 105 | clipboardBtnCopies.on('success', function(e) { 106 | changeTooltipMessage(e.trigger, 'Copied!'); 107 | e.clearSelection(); 108 | }); 109 | 110 | clipboardBtnCopies.on('error', 
function() { 111 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 112 | }); 113 | }); 114 | } 115 | })(window.jQuery || window.$) 116 | -------------------------------------------------------------------------------- /src/fasttext/quantmatrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "quantmatrix.h" 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace fasttext { 16 | 17 | QuantMatrix::QuantMatrix() : Matrix(), qnorm_(false), codesize_(0) {} 18 | 19 | QuantMatrix::QuantMatrix(DenseMatrix&& mat, int32_t dsub, bool qnorm) 20 | : Matrix(mat.size(0), mat.size(1)), 21 | qnorm_(qnorm), 22 | codesize_(mat.size(0) * ((mat.size(1) + dsub - 1) / dsub)) { 23 | codes_.resize(codesize_); 24 | pq_ = std::unique_ptr(new ProductQuantizer(n_, dsub)); 25 | if (qnorm_) { 26 | norm_codes_.resize(m_); 27 | npq_ = std::unique_ptr(new ProductQuantizer(1, 1)); 28 | } 29 | quantize(std::forward(mat)); 30 | } 31 | 32 | void QuantMatrix::quantizeNorm(const Vector& norms) { 33 | assert(qnorm_); 34 | assert(norms.size() == m_); 35 | auto dataptr = norms.data(); 36 | npq_->train(m_, dataptr); 37 | npq_->compute_codes(dataptr, norm_codes_.data(), m_); 38 | } 39 | 40 | void QuantMatrix::quantize(DenseMatrix&& mat) { 41 | if (qnorm_) { 42 | Vector norms(mat.size(0)); 43 | mat.l2NormRow(norms); 44 | mat.divideRow(norms); 45 | quantizeNorm(norms); 46 | } 47 | auto dataptr = mat.data(); 48 | pq_->train(m_, dataptr); 49 | pq_->compute_codes(dataptr, codes_.data(), m_); 50 | } 51 | 52 | real QuantMatrix::dotRow(const Vector& vec, int64_t i) const { 53 | assert(i >= 0); 54 | assert(i < m_); 55 | assert(vec.size() == n_); 56 | real norm = 1; 57 | if (qnorm_) { 58 | norm = 
npq_->get_centroids(0, norm_codes_[i])[0]; 59 | } 60 | return pq_->mulcode(vec, codes_.data(), i, norm); 61 | } 62 | 63 | void QuantMatrix::addVectorToRow(const Vector&, int64_t, real) { 64 | throw std::runtime_error("Operation not permitted on quantized matrices."); 65 | } 66 | 67 | void QuantMatrix::addRowToVector(Vector& x, int32_t i, real a) const { 68 | real norm = 1; 69 | if (qnorm_) { 70 | norm = npq_->get_centroids(0, norm_codes_[i])[0]; 71 | } 72 | pq_->addcode(x, codes_.data(), i, a * norm); 73 | } 74 | 75 | void QuantMatrix::addRowToVector(Vector& x, int32_t i) const { 76 | real norm = 1; 77 | if (qnorm_) { 78 | norm = npq_->get_centroids(0, norm_codes_[i])[0]; 79 | } 80 | pq_->addcode(x, codes_.data(), i, norm); 81 | } 82 | 83 | void QuantMatrix::save(std::ostream& out) const { 84 | out.write((char*)&qnorm_, sizeof(qnorm_)); 85 | out.write((char*)&m_, sizeof(m_)); 86 | out.write((char*)&n_, sizeof(n_)); 87 | out.write((char*)&codesize_, sizeof(codesize_)); 88 | out.write((char*)codes_.data(), codesize_ * sizeof(uint8_t)); 89 | pq_->save(out); 90 | if (qnorm_) { 91 | out.write((char*)norm_codes_.data(), m_ * sizeof(uint8_t)); 92 | npq_->save(out); 93 | } 94 | } 95 | 96 | void QuantMatrix::load(std::istream& in) { 97 | in.read((char*)&qnorm_, sizeof(qnorm_)); 98 | in.read((char*)&m_, sizeof(m_)); 99 | in.read((char*)&n_, sizeof(n_)); 100 | in.read((char*)&codesize_, sizeof(codesize_)); 101 | codes_ = std::vector(codesize_); 102 | in.read((char*)codes_.data(), codesize_ * sizeof(uint8_t)); 103 | pq_ = std::unique_ptr(new ProductQuantizer()); 104 | pq_->load(in); 105 | if (qnorm_) { 106 | norm_codes_ = std::vector(m_); 107 | in.read((char*)norm_codes_.data(), m_ * sizeof(uint8_t)); 108 | npq_ = std::unique_ptr(new ProductQuantizer()); 109 | npq_->load(in); 110 | } 111 | } 112 | 113 | void QuantMatrix::dump(std::ostream&) const { 114 | throw std::runtime_error("Operation not permitted on quantized matrices."); 115 | } 116 | 117 | } // namespace fasttext 118 
| -------------------------------------------------------------------------------- /src/fasttext/loss.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "matrix.h" 16 | #include "model.h" 17 | #include "real.h" 18 | #include "utils.h" 19 | #include "vector.h" 20 | 21 | namespace fasttext { 22 | 23 | class Loss { 24 | private: 25 | void findKBest( 26 | int32_t k, 27 | real threshold, 28 | Predictions& heap, 29 | const Vector& output) const; 30 | 31 | protected: 32 | std::vector t_sigmoid_; 33 | std::vector t_log_; 34 | std::shared_ptr& wo_; 35 | 36 | real log(real x) const; 37 | real sigmoid(real x) const; 38 | 39 | public: 40 | explicit Loss(std::shared_ptr& wo); 41 | virtual ~Loss() = default; 42 | 43 | virtual real forward( 44 | const std::vector& targets, 45 | int32_t targetIndex, 46 | Model::State& state, 47 | real lr, 48 | bool backprop) = 0; 49 | virtual void computeOutput(Model::State& state) const = 0; 50 | 51 | virtual void predict( 52 | int32_t /*k*/, 53 | real /*threshold*/, 54 | Predictions& /*heap*/, 55 | Model::State& /*state*/) const; 56 | }; 57 | 58 | class BinaryLogisticLoss : public Loss { 59 | protected: 60 | real binaryLogistic( 61 | int32_t target, 62 | Model::State& state, 63 | bool labelIsPositive, 64 | real lr, 65 | bool backprop) const; 66 | 67 | public: 68 | explicit BinaryLogisticLoss(std::shared_ptr& wo); 69 | virtual ~BinaryLogisticLoss() noexcept override = default; 70 | void computeOutput(Model::State& state) const override; 71 | }; 72 | 73 | class OneVsAllLoss : public BinaryLogisticLoss { 74 | public: 75 | explicit OneVsAllLoss(std::shared_ptr& wo); 76 | ~OneVsAllLoss() noexcept override = 
default; 77 | real forward( 78 | const std::vector& targets, 79 | int32_t targetIndex, 80 | Model::State& state, 81 | real lr, 82 | bool backprop) override; 83 | }; 84 | 85 | class NegativeSamplingLoss : public BinaryLogisticLoss { 86 | protected: 87 | static const int32_t NEGATIVE_TABLE_SIZE = 10000000; 88 | 89 | int neg_; 90 | std::vector negatives_; 91 | std::uniform_int_distribution uniform_; 92 | int32_t getNegative(int32_t target, std::minstd_rand& rng); 93 | 94 | public: 95 | explicit NegativeSamplingLoss( 96 | std::shared_ptr& wo, 97 | int neg, 98 | const std::vector& targetCounts); 99 | ~NegativeSamplingLoss() noexcept override = default; 100 | 101 | real forward( 102 | const std::vector& targets, 103 | int32_t targetIndex, 104 | Model::State& state, 105 | real lr, 106 | bool backprop) override; 107 | }; 108 | 109 | class HierarchicalSoftmaxLoss : public BinaryLogisticLoss { 110 | protected: 111 | struct Node { 112 | int32_t parent; 113 | int32_t left; 114 | int32_t right; 115 | int64_t count; 116 | bool binary; 117 | }; 118 | 119 | std::vector> paths_; 120 | std::vector> codes_; 121 | std::vector tree_; 122 | int32_t osz_; 123 | void buildTree(const std::vector& counts); 124 | void dfs( 125 | int32_t k, 126 | real threshold, 127 | int32_t node, 128 | real score, 129 | Predictions& heap, 130 | const Vector& hidden) const; 131 | 132 | public: 133 | explicit HierarchicalSoftmaxLoss( 134 | std::shared_ptr& wo, 135 | const std::vector& counts); 136 | ~HierarchicalSoftmaxLoss() noexcept override = default; 137 | real forward( 138 | const std::vector& targets, 139 | int32_t targetIndex, 140 | Model::State& state, 141 | real lr, 142 | bool backprop) override; 143 | void predict( 144 | int32_t k, 145 | real threshold, 146 | Predictions& heap, 147 | Model::State& state) const override; 148 | }; 149 | 150 | class SoftmaxLoss : public Loss { 151 | public: 152 | explicit SoftmaxLoss(std::shared_ptr& wo); 153 | ~SoftmaxLoss() noexcept override = default; 154 | real 
forward( 155 | const std::vector& targets, 156 | int32_t targetIndex, 157 | Model::State& state, 158 | real lr, 159 | bool backprop) override; 160 | void computeOutput(Model::State& state) const override; 161 | }; 162 | 163 | } // namespace fasttext 164 | -------------------------------------------------------------------------------- /src/fasttext/densematrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "densematrix.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "utils.h" 16 | #include "vector.h" 17 | 18 | namespace fasttext { 19 | 20 | DenseMatrix::DenseMatrix() : DenseMatrix(0, 0) {} 21 | 22 | DenseMatrix::DenseMatrix(int64_t m, int64_t n) : Matrix(m, n), data_(m * n) {} 23 | 24 | DenseMatrix::DenseMatrix(DenseMatrix&& other) noexcept 25 | : Matrix(other.m_, other.n_), data_(std::move(other.data_)) {} 26 | 27 | void DenseMatrix::zero() { 28 | std::fill(data_.begin(), data_.end(), 0.0); 29 | } 30 | 31 | void DenseMatrix::uniformThread(real a, int block, int32_t seed) { 32 | std::minstd_rand rng(block + seed); 33 | std::uniform_real_distribution<> uniform(-a, a); 34 | int64_t blockSize = (m_ * n_) / 10; 35 | for (int64_t i = blockSize * block; 36 | i < (m_ * n_) && i < blockSize * (block + 1); 37 | i++) { 38 | data_[i] = uniform(rng); 39 | } 40 | } 41 | 42 | void DenseMatrix::uniform(real a, unsigned int thread, int32_t seed) { 43 | std::vector threads; 44 | for (int i = 0; i < thread; i++) { 45 | threads.push_back(std::thread([=]() { uniformThread(a, i, seed); })); 46 | } 47 | for (int32_t i = 0; i < threads.size(); i++) { 48 | threads[i].join(); 49 | } 50 | } 51 | 52 | void DenseMatrix::multiplyRow(const Vector& nums, int64_t ib, int64_t ie) { 53 | if 
(ie == -1) { 54 | ie = m_; 55 | } 56 | assert(ie <= nums.size()); 57 | for (auto i = ib; i < ie; i++) { 58 | real n = nums[i - ib]; 59 | if (n != 0) { 60 | for (auto j = 0; j < n_; j++) { 61 | at(i, j) *= n; 62 | } 63 | } 64 | } 65 | } 66 | 67 | void DenseMatrix::divideRow(const Vector& denoms, int64_t ib, int64_t ie) { 68 | if (ie == -1) { 69 | ie = m_; 70 | } 71 | assert(ie <= denoms.size()); 72 | for (auto i = ib; i < ie; i++) { 73 | real n = denoms[i - ib]; 74 | if (n != 0) { 75 | for (auto j = 0; j < n_; j++) { 76 | at(i, j) /= n; 77 | } 78 | } 79 | } 80 | } 81 | 82 | real DenseMatrix::l2NormRow(int64_t i) const { 83 | auto norm = 0.0; 84 | for (auto j = 0; j < n_; j++) { 85 | norm += at(i, j) * at(i, j); 86 | } 87 | if (std::isnan(norm)) { 88 | throw EncounteredNaNError(); 89 | } 90 | return std::sqrt(norm); 91 | } 92 | 93 | void DenseMatrix::l2NormRow(Vector& norms) const { 94 | assert(norms.size() == m_); 95 | for (auto i = 0; i < m_; i++) { 96 | norms[i] = l2NormRow(i); 97 | } 98 | } 99 | 100 | real DenseMatrix::dotRow(const Vector& vec, int64_t i) const { 101 | assert(i >= 0); 102 | assert(i < m_); 103 | assert(vec.size() == n_); 104 | real d = 0.0; 105 | for (int64_t j = 0; j < n_; j++) { 106 | d += at(i, j) * vec[j]; 107 | } 108 | if (std::isnan(d)) { 109 | throw EncounteredNaNError(); 110 | } 111 | return d; 112 | } 113 | 114 | void DenseMatrix::addVectorToRow(const Vector& vec, int64_t i, real a) { 115 | assert(i >= 0); 116 | assert(i < m_); 117 | assert(vec.size() == n_); 118 | for (int64_t j = 0; j < n_; j++) { 119 | data_[i * n_ + j] += a * vec[j]; 120 | } 121 | } 122 | 123 | void DenseMatrix::addRowToVector(Vector& x, int32_t i) const { 124 | assert(i >= 0); 125 | assert(i < this->size(0)); 126 | assert(x.size() == this->size(1)); 127 | for (int64_t j = 0; j < n_; j++) { 128 | x[j] += at(i, j); 129 | } 130 | } 131 | 132 | void DenseMatrix::addRowToVector(Vector& x, int32_t i, real a) const { 133 | assert(i >= 0); 134 | assert(i < this->size(0)); 
135 | assert(x.size() == this->size(1)); 136 | for (int64_t j = 0; j < n_; j++) { 137 | x[j] += a * at(i, j); 138 | } 139 | } 140 | 141 | void DenseMatrix::save(std::ostream& out) const { 142 | out.write((char*)&m_, sizeof(int64_t)); 143 | out.write((char*)&n_, sizeof(int64_t)); 144 | out.write((char*)data_.data(), m_ * n_ * sizeof(real)); 145 | } 146 | 147 | void DenseMatrix::load(std::istream& in) { 148 | in.read((char*)&m_, sizeof(int64_t)); 149 | in.read((char*)&n_, sizeof(int64_t)); 150 | data_ = std::vector(m_ * n_); 151 | in.read((char*)data_.data(), m_ * n_ * sizeof(real)); 152 | } 153 | 154 | void DenseMatrix::dump(std::ostream& out) const { 155 | out << m_ << " " << n_ << std::endl; 156 | for (int64_t i = 0; i < m_; i++) { 157 | for (int64_t j = 0; j < n_; j++) { 158 | if (j > 0) { 159 | out << " "; 160 | } 161 | out << at(i, j); 162 | } 163 | out << std::endl; 164 | } 165 | }; 166 | 167 | } // namespace fasttext 168 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include "../inst/include/fastrtext.h" 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace Rcpp; 10 | 11 | // add_prefix 12 | CharacterVector add_prefix(const CharacterVector& texts, CharacterVector prefix); 13 | static SEXP _fastrtext_add_prefix_try(SEXP textsSEXP, SEXP prefixSEXP) { 14 | BEGIN_RCPP 15 | Rcpp::RObject rcpp_result_gen; 16 | Rcpp::traits::input_parameter< const CharacterVector& >::type texts(textsSEXP); 17 | Rcpp::traits::input_parameter< CharacterVector >::type prefix(prefixSEXP); 18 | rcpp_result_gen = Rcpp::wrap(add_prefix(texts, prefix)); 19 | return rcpp_result_gen; 20 | END_RCPP_RETURN_ERROR 21 | } 22 | RcppExport SEXP _fastrtext_add_prefix(SEXP textsSEXP, SEXP prefixSEXP) { 23 | SEXP 
rcpp_result_gen; 24 | { 25 | Rcpp::RNGScope rcpp_rngScope_gen; 26 | rcpp_result_gen = PROTECT(_fastrtext_add_prefix_try(textsSEXP, prefixSEXP)); 27 | } 28 | Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error"); 29 | if (rcpp_isInterrupt_gen) { 30 | UNPROTECT(1); 31 | Rf_onintr(); 32 | } 33 | bool rcpp_isLongjump_gen = Rcpp::internal::isLongjumpSentinel(rcpp_result_gen); 34 | if (rcpp_isLongjump_gen) { 35 | Rcpp::internal::resumeJump(rcpp_result_gen); 36 | } 37 | Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error"); 38 | if (rcpp_isError_gen) { 39 | SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); 40 | UNPROTECT(1); 41 | Rf_error(CHAR(rcpp_msgSEXP_gen)); 42 | } 43 | UNPROTECT(1); 44 | return rcpp_result_gen; 45 | } 46 | // add_pr 47 | std::string add_pr(const std::string& line, const std::string& prefix); 48 | static SEXP _fastrtext_add_pr_try(SEXP lineSEXP, SEXP prefixSEXP) { 49 | BEGIN_RCPP 50 | Rcpp::RObject rcpp_result_gen; 51 | Rcpp::traits::input_parameter< const std::string& >::type line(lineSEXP); 52 | Rcpp::traits::input_parameter< const std::string& >::type prefix(prefixSEXP); 53 | rcpp_result_gen = Rcpp::wrap(add_pr(line, prefix)); 54 | return rcpp_result_gen; 55 | END_RCPP_RETURN_ERROR 56 | } 57 | RcppExport SEXP _fastrtext_add_pr(SEXP lineSEXP, SEXP prefixSEXP) { 58 | SEXP rcpp_result_gen; 59 | { 60 | Rcpp::RNGScope rcpp_rngScope_gen; 61 | rcpp_result_gen = PROTECT(_fastrtext_add_pr_try(lineSEXP, prefixSEXP)); 62 | } 63 | Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error"); 64 | if (rcpp_isInterrupt_gen) { 65 | UNPROTECT(1); 66 | Rf_onintr(); 67 | } 68 | bool rcpp_isLongjump_gen = Rcpp::internal::isLongjumpSentinel(rcpp_result_gen); 69 | if (rcpp_isLongjump_gen) { 70 | Rcpp::internal::resumeJump(rcpp_result_gen); 71 | } 72 | Rboolean rcpp_isError_gen = Rf_inherits(rcpp_result_gen, "try-error"); 73 | if (rcpp_isError_gen) { 74 | SEXP rcpp_msgSEXP_gen = 
Rf_asChar(rcpp_result_gen); 75 | UNPROTECT(1); 76 | Rf_error(CHAR(rcpp_msgSEXP_gen)); 77 | } 78 | UNPROTECT(1); 79 | return rcpp_result_gen; 80 | } 81 | 82 | // validate (ensure exported C++ functions exist before calling them) 83 | static int _fastrtext_RcppExport_validate(const char* sig) { 84 | static std::set signatures; 85 | if (signatures.empty()) { 86 | signatures.insert("CharacterVector(*add_prefix)(const CharacterVector&,CharacterVector)"); 87 | signatures.insert("std::string(*add_pr)(const std::string&,const std::string&)"); 88 | } 89 | return signatures.find(sig) != signatures.end(); 90 | } 91 | 92 | // registerCCallable (register entry points for exported C++ functions) 93 | RcppExport SEXP _fastrtext_RcppExport_registerCCallable() { 94 | R_RegisterCCallable("fastrtext", "_fastrtext_add_prefix", (DL_FUNC)_fastrtext_add_prefix_try); 95 | R_RegisterCCallable("fastrtext", "_fastrtext_add_pr", (DL_FUNC)_fastrtext_add_pr_try); 96 | R_RegisterCCallable("fastrtext", "_fastrtext_RcppExport_validate", (DL_FUNC)_fastrtext_RcppExport_validate); 97 | return R_NilValue; 98 | } 99 | 100 | RcppExport SEXP _rcpp_module_boot_FASTRTEXT_MODULE(); 101 | 102 | static const R_CallMethodDef CallEntries[] = { 103 | {"_fastrtext_add_prefix", (DL_FUNC) &_fastrtext_add_prefix, 2}, 104 | {"_fastrtext_add_pr", (DL_FUNC) &_fastrtext_add_pr, 2}, 105 | {"_rcpp_module_boot_FASTRTEXT_MODULE", (DL_FUNC) &_rcpp_module_boot_FASTRTEXT_MODULE, 0}, 106 | {"_fastrtext_RcppExport_registerCCallable", (DL_FUNC) &_fastrtext_RcppExport_registerCCallable, 0}, 107 | {NULL, NULL, 0} 108 | }; 109 | 110 | RcppExport void R_init_fastrtext(DllInfo *dll) { 111 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 112 | R_useDynamicSymbols(dll, FALSE); 113 | } 114 | -------------------------------------------------------------------------------- /src/fasttext/fasttext.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, 
Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "args.h" 22 | #include "densematrix.h" 23 | #include "dictionary.h" 24 | #include "matrix.h" 25 | #include "meter.h" 26 | #include "model.h" 27 | #include "real.h" 28 | #include "utils.h" 29 | #include "vector.h" 30 | 31 | namespace fasttext { 32 | 33 | class FastText { 34 | protected: 35 | std::shared_ptr args_; 36 | std::shared_ptr dict_; 37 | std::shared_ptr input_; 38 | std::shared_ptr output_; 39 | std::shared_ptr model_; 40 | std::atomic tokenCount_{}; 41 | std::atomic loss_{}; 42 | std::chrono::steady_clock::time_point start_; 43 | bool quant_; 44 | int32_t version; 45 | std::unique_ptr wordVectors_; 46 | std::exception_ptr trainException_; 47 | 48 | void signModel(std::ostream&); 49 | bool checkModel(std::istream&); 50 | void startThreads(); 51 | void addInputVector(Vector&, int32_t) const; 52 | void trainThread(int32_t); 53 | std::vector> getNN( 54 | const DenseMatrix& wordVectors, 55 | const Vector& queryVec, 56 | int32_t k, 57 | const std::set& banSet); 58 | void lazyComputeWordVectors(); 59 | void printInfo(real, real, std::ostream&); 60 | std::shared_ptr getInputMatrixFromFile(const std::string&) const; 61 | std::shared_ptr createRandomMatrix() const; 62 | std::shared_ptr createTrainOutputMatrix() const; 63 | std::vector getTargetCounts() const; 64 | std::shared_ptr createLoss(std::shared_ptr& output); 65 | void supervised( 66 | Model::State& state, 67 | real lr, 68 | const std::vector& line, 69 | const std::vector& labels); 70 | void cbow(Model::State& state, real lr, const std::vector& line); 71 | void skipgram(Model::State& state, real lr, const std::vector& line); 72 | std::vector selectEmbeddings(int32_t cutoff) 
const; 73 | void precomputeWordVectors(DenseMatrix& wordVectors); 74 | bool keepTraining(const int64_t ntokens) const; 75 | 76 | public: 77 | FastText(); 78 | 79 | int32_t getWordId(const std::string& word) const; 80 | 81 | int32_t getSubwordId(const std::string& subword) const; 82 | 83 | void getWordVector(Vector& vec, const std::string& word) const; 84 | 85 | void getSubwordVector(Vector& vec, const std::string& subword) const; 86 | 87 | inline void getInputVector(Vector& vec, int32_t ind) { 88 | vec.zero(); 89 | addInputVector(vec, ind); 90 | } 91 | 92 | const Args getArgs() const; 93 | 94 | std::shared_ptr getDictionary() const; 95 | 96 | std::shared_ptr getInputMatrix() const; 97 | 98 | std::shared_ptr getOutputMatrix() const; 99 | 100 | void saveVectors(const std::string& filename); 101 | 102 | void saveModel(const std::string& filename); 103 | 104 | void saveOutput(const std::string& filename); 105 | 106 | void loadModel(std::istream& in); 107 | 108 | void loadModel(const std::string& filename); 109 | 110 | void getSentenceVector(std::istream& in, Vector& vec); 111 | 112 | void quantize(const Args& qargs); 113 | 114 | std::tuple 115 | test(std::istream& in, int32_t k, real threshold = 0.0); 116 | 117 | void test(std::istream& in, int32_t k, real threshold, Meter& meter) const; 118 | 119 | void predict( 120 | int32_t k, 121 | const std::vector& words, 122 | Predictions& predictions, 123 | real threshold = 0.0) const; 124 | 125 | bool predictLine( 126 | std::istream& in, 127 | std::vector>& predictions, 128 | int32_t k, 129 | real threshold) const; 130 | 131 | std::vector> getNgramVectors( 132 | const std::string& word) const; 133 | 134 | std::vector> getNN( 135 | const std::string& word, 136 | int32_t k); 137 | 138 | std::vector> getAnalogies( 139 | int32_t k, 140 | const std::string& wordA, 141 | const std::string& wordB, 142 | const std::string& wordC); 143 | 144 | void train(const Args& args); 145 | 146 | void abort(); 147 | 148 | int getDimension() const; 
149 | 150 | bool isQuant() const; 151 | 152 | class AbortError : public std::runtime_error { 153 | public: 154 | AbortError() : std::runtime_error("Aborted.") {} 155 | }; 156 | }; 157 | } // namespace fasttext 158 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 122 | 123 |
    YEAR: 2017
    124 | COPYRIGHT HOLDER: Michaël Benesty
    125 | 
    126 | 127 |
    128 | 129 |
    130 | 131 | 132 |
    133 | 136 | 137 |
    138 |

    Site built with pkgdown 1.3.0.

    139 |
    140 |
    141 |
    142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body > .container { 21 | display: flex; 22 | height: 100%; 23 | flex-direction: column; 24 | 25 | padding-top: 60px; 26 | } 27 | 28 | body > .container .row { 29 | flex: 1 0 auto; 30 | } 31 | 32 | footer { 33 | margin-top: 45px; 34 | padding: 35px 0 36px; 35 | border-top: 1px solid #e5e5e5; 36 | color: #666; 37 | display: flex; 38 | flex-shrink: 0; 39 | } 40 | footer p { 41 | margin-bottom: 0; 42 | } 43 | footer div { 44 | flex: 1; 45 | } 46 | footer .pkgdown { 47 | text-align: right; 48 | } 49 | footer p { 50 | margin-bottom: 0; 51 | } 52 | 53 | img.icon { 54 | float: right; 55 | } 56 | 57 | img { 58 | max-width: 100%; 59 | } 60 | 61 | /* Fix bug in bootstrap (only seen in firefox) */ 62 | summary { 63 | display: list-item; 64 | } 65 | 66 | /* Typographic tweaking ---------------------------------*/ 67 | 68 | .contents .page-header { 69 | margin-top: calc(-60px + 1em); 70 | } 71 | 72 | /* Section anchors ---------------------------------*/ 73 | 74 | a.anchor { 75 | margin-left: -30px; 76 | display:inline-block; 77 | width: 30px; 78 | height: 30px; 79 | visibility: hidden; 80 | 81 | background-image: url(./link.svg); 82 | background-repeat: no-repeat; 83 | background-size: 20px 20px; 84 | background-position: center 
center; 85 | } 86 | 87 | .hasAnchor:hover a.anchor { 88 | visibility: visible; 89 | } 90 | 91 | @media (max-width: 767px) { 92 | .hasAnchor:hover a.anchor { 93 | visibility: hidden; 94 | } 95 | } 96 | 97 | 98 | /* Fixes for fixed navbar --------------------------*/ 99 | 100 | .contents h1, .contents h2, .contents h3, .contents h4 { 101 | padding-top: 60px; 102 | margin-top: -40px; 103 | } 104 | 105 | /* Static header placement on mobile devices */ 106 | @media (max-width: 767px) { 107 | .navbar-fixed-top { 108 | position: absolute; 109 | } 110 | .navbar { 111 | padding: 0; 112 | } 113 | } 114 | 115 | 116 | /* Sidebar --------------------------*/ 117 | 118 | #sidebar { 119 | margin-top: 30px; 120 | } 121 | #sidebar h2 { 122 | font-size: 1.5em; 123 | margin-top: 1em; 124 | } 125 | 126 | #sidebar h2:first-child { 127 | margin-top: 0; 128 | } 129 | 130 | #sidebar .list-unstyled li { 131 | margin-bottom: 0.5em; 132 | } 133 | 134 | .orcid { 135 | height: 16px; 136 | vertical-align: middle; 137 | } 138 | 139 | /* Reference index & topics ----------------------------------------------- */ 140 | 141 | .ref-index th {font-weight: normal;} 142 | 143 | .ref-index td {vertical-align: top;} 144 | .ref-index .icon {width: 40px;} 145 | .ref-index .alias {width: 40%;} 146 | .ref-index-icons .alias {width: calc(40% - 40px);} 147 | .ref-index .title {width: 60%;} 148 | 149 | .ref-arguments th {text-align: right; padding-right: 10px;} 150 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 151 | .ref-arguments .name {width: 20%;} 152 | .ref-arguments .desc {width: 80%;} 153 | 154 | /* Nice scrolling for wide elements --------------------------------------- */ 155 | 156 | table { 157 | display: block; 158 | overflow: auto; 159 | } 160 | 161 | /* Syntax highlighting ---------------------------------------------------- */ 162 | 163 | pre { 164 | word-wrap: normal; 165 | word-break: normal; 166 | border: 1px solid #eee; 167 | } 168 | 169 | pre, code { 170 | background-color: 
#f8f8f8; 171 | color: #333; 172 | } 173 | 174 | pre code { 175 | overflow: auto; 176 | word-wrap: normal; 177 | white-space: pre; 178 | } 179 | 180 | pre .img { 181 | margin: 5px 0; 182 | } 183 | 184 | pre .img img { 185 | background-color: #fff; 186 | display: block; 187 | height: auto; 188 | } 189 | 190 | code a, pre a { 191 | color: #375f84; 192 | } 193 | 194 | a.sourceLine:hover { 195 | text-decoration: none; 196 | } 197 | 198 | .fl {color: #1514b5;} 199 | .fu {color: #000000;} /* function */ 200 | .ch,.st {color: #036a07;} /* string */ 201 | .kw {color: #264D66;} /* keyword */ 202 | .co {color: #888888;} /* comment */ 203 | 204 | .message { color: black; font-weight: bolder;} 205 | .error { color: orange; font-weight: bolder;} 206 | .warning { color: #6A0366; font-weight: bolder;} 207 | 208 | /* Clipboard --------------------------*/ 209 | 210 | .hasCopyButton { 211 | position: relative; 212 | } 213 | 214 | .btn-copy-ex { 215 | position: absolute; 216 | right: 0; 217 | top: 0; 218 | visibility: hidden; 219 | } 220 | 221 | .hasCopyButton:hover button.btn-copy-ex { 222 | visibility: visible; 223 | } 224 | 225 | /* mark.js ----------------------------*/ 226 | 227 | mark { 228 | background-color: rgba(255, 255, 51, 0.5); 229 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 230 | padding: 1px; 231 | } 232 | 233 | /* vertical spacing after htmlwidgets */ 234 | .html-widget { 235 | margin-bottom: 10px; 236 | } 237 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 122 | 123 |
      124 |
    • 125 |

      Michaël Benesty. Author, maintainer, copyright holder. 126 |

      127 |
    • 128 |
    • 129 |

      Facebook, Inc. Copyright holder. 130 |

      131 |
    • 132 |
    133 | 134 |
    135 | 136 |
    137 | 138 | 139 |
    140 | 143 | 144 |
    145 |

    Site built with pkgdown 1.3.0.

    146 |
    147 |
    148 |
    149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Changelog • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 123 | 124 |
    125 | 126 | 133 | 134 |
    135 | 136 |
    137 | 140 | 141 |
    142 |

    Site built with pkgdown 1.3.0.

    143 |
    144 |
    145 |
    146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 113 | 114 | 115 |
    116 | 117 |
    118 |
    119 | 122 | 123 |
    124 |

    All vignettes

    125 |

    126 | 127 | 132 |
    133 |
    134 |
    135 | 136 |
    137 | 140 | 141 |
    142 |

    Site built with pkgdown 1.3.0.

    143 |
    144 |
    145 |
    146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /tests/testthat/test-supervised.R: -------------------------------------------------------------------------------- 1 | context("Supervised training") 2 | 3 | data("train_sentences") 4 | data("test_sentences") 5 | 6 | test_labels <- paste0("__label__", test_sentences[, "class.text"]) 7 | test_labels_without_prefix <- test_sentences[, "class.text"] 8 | test_texts <- tolower(test_sentences[, "text"]) 9 | test_sentences_with_labels <- paste(test_labels, test_texts) 10 | 11 | model_test_path <- system.file("extdata", 12 | "model_classification_test.bin", 13 | package = "fastrtext") 14 | 15 | test_that("Training of a classification model", { 16 | # prepare data 17 | tmp_file_model <- tempfile() 18 | tmp_file_model_quantize <- tempfile() 19 | 20 | train_labels <- paste0("__label__", train_sentences[, "class.text"]) 21 | train_texts <- tolower(train_sentences[, "text"]) 22 | train_to_write <- paste(train_labels, train_texts) 23 | train_tmp_file_txt <- tempfile() 24 | writeLines(text = train_to_write, con = train_tmp_file_txt) 25 | 26 | # learn model 27 | execute(commands = 28 | c("supervised", 29 | "-input", train_tmp_file_txt, 30 | "-output", tmp_file_model, 31 | "-dim", 10, 32 | "-lr", 1, 33 | "-epoch", 10, 34 | "-bucket", 1e4, 35 | "-verbose", 0)) 36 | 37 | # Check learned file exists 38 | expect_true(file.exists(paste0(tmp_file_model, ".bin"))) 39 | 40 | learned_model <- load_model(tmp_file_model) 41 | learned_model_predictions <- predict(learned_model, 42 | sentences = test_sentences_with_labels) 43 | 44 | # Compare with embedded model 45 | embedded_model <- load_model(model_test_path) 46 | embedded_model_predictions <- predict(embedded_model, 47 | sentences = test_sentences_with_labels) 48 | expect_gt(mean(names(unlist(learned_model_predictions)) == 49 | names(unlist(embedded_model_predictions))), 0.75) 50 | 51 | build_supervised(documents = 
train_texts, 52 | targets = train_sentences[, "class.text"], 53 | model_path = tmp_file_model, 54 | dim = 10, 55 | lr = 1, 56 | epoch = 10, 57 | bucket = 1e4, 58 | verbose = 0) 59 | 60 | expect_true(file.exists(paste0(tmp_file_model, ".bin"))) 61 | 62 | learned_model <- load_model(tmp_file_model) 63 | learned_model_predictions_bis <- predict(learned_model, 64 | sentences = test_sentences_with_labels) 65 | 66 | expect_gt(object = mean(names(unlist(learned_model_predictions)) == names(unlist(learned_model_predictions_bis))), 67 | expected = 0.75) 68 | 69 | # check with simplify = TRUE 70 | embedded_model_predictions_bis <- predict(embedded_model, 71 | sentences = test_sentences_with_labels, 72 | simplify = TRUE) 73 | expect_true(is.numeric(embedded_model_predictions_bis)) 74 | expect_gt(mean(names(unlist(learned_model_predictions)) == 75 | names(embedded_model_predictions_bis)), 0.75) 76 | 77 | # Compare with quantize model 78 | # execute(commands = c("quantize", 79 | # "-output", tmp_file_model, 80 | # "-input", train_tmp_file_txt, 81 | # "-qnorm", 82 | # "-retrain", 83 | # "-epoch", 10, 84 | # "-cutoff", 100000)) 85 | # 86 | # expect_true(file.exists(paste0(tmp_file_model, ".ftz"))) 87 | # quantized_model <- load_model(paste0(tmp_file_model, ".ftz")) 88 | # quantized_model_predictions <- predict(quantized_model, 89 | # sentences = test_sentences_with_labels) 90 | # expect_gt(mean(names(unlist(embedded_model_predictions_bis)) == 91 | # names(unlist(quantized_model_predictions))), 0.75) 92 | }) 93 | 94 | test_that("Test predictions", { 95 | model <- load_model(model_test_path) 96 | predictions <- predict(model, sentences = test_sentences_with_labels) 97 | 98 | # test measure (for 1 class, hamming == accuracy) 99 | expect_equal(get_hamming_loss(as.list(test_labels_without_prefix), predictions), 100 | mean(sapply(predictions, names) == test_labels_without_prefix)) 101 | 102 | expect_gt(get_hamming_loss(as.list(test_labels_without_prefix), predictions), 0.75) 103 | 104 
| predictions <- predict(model, sentences = test_sentences_with_labels) 105 | expect_length(predictions, 600) 106 | expect_equal(unique(lengths(predictions)), 1) 107 | expect_equal(unique(lengths(predict(model, 108 | sentences = test_sentences_with_labels, 109 | k = 2))), 2) 110 | expect_gt(object = mean(sapply(predictions, names) == test_labels_without_prefix), 111 | expected = 0.75) 112 | }) 113 | 114 | test_that("Test parameter extraction", { 115 | model <- load_model(model_test_path) 116 | parameters <- get_parameters(model) 117 | expect_equal(parameters$model_name, "supervised") 118 | }) 119 | 120 | test_that("Test label extraction", { 121 | model <- load_model(model_test_path) 122 | labels_from_model <- get_labels(model) 123 | expect_length(labels_from_model, 15) 124 | }) 125 | 126 | test_that("Test formating documents", { 127 | tags <- list(c(1, 5), 0) 128 | documents <- c("this is a text", "this is another document") 129 | results <- add_tags(documents = documents, tags = tags) 130 | expect_length(results, 2) 131 | expect_equal(results[1], "__label__1 __label__5 this is a text") 132 | 133 | results <- add_tags(documents = documents, tags = c(0, 1)) 134 | expect_length(results, 2) 135 | expect_equal(results[1], "__label__0 this is a text") 136 | }) 137 | 138 | gc() 139 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | ![fastrtext](https://github.com/pommedeterresautee/fastrtext/raw/master/tools/logo.png) 2 | ========= 3 | 4 | [![Travis-CI Build Status](https://travis-ci.org/pommedeterresautee/fastrtext.svg?branch=master)](https://travis-ci.org/pommedeterresautee/fastrtext) 5 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/pommedeterresautee/fastrtext?branch=master&svg=true)](https://ci.appveyor.com/project/pommedeterresautee/fastrtext) 6 | 
[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/fastrtext)](https://cran.r-project.org/package=fastrtext) 7 | [![CRAN_Download](http://cranlogs.r-pkg.org/badges/fastrtext)](http://cran.rstudio.com/web/packages/fastrtext/index.html) 8 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 9 | [![codecov](https://codecov.io/gh/pommedeterresautee/fastrtext/branch/master/graph/badge.svg)](https://codecov.io/gh/pommedeterresautee/fastrtext) 10 | [![Follow](https://img.shields.io/twitter/follow/pommedeterre33.svg?style=social)](https://twitter.com/intent/follow?screen_name=pommedeterre33) 11 | 12 | [R Documentation](https://pommedeterresautee.github.io/fastrtext/) | [Release Notes](https://github.com/pommedeterresautee/fastrtext/blob/master/NEWS.md) | [FAQ](https://fasttext.cc/docs/en/faqs.html) | [Multilingual pretrained models](https://fasttext.cc/docs/en/crawl-vectors.html) 13 | 14 | R wrapper for [fastText](https://github.com/facebookresearch/fastText) C++ code from Facebook. 15 | 16 | FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices. 17 | 18 | 19 | License 20 | ------- 21 | 22 | © Contributors, 2018. Licensed under a MIT license. 23 | 24 | Installation 25 | ------------ 26 | 27 | You can install the `fastrtext` package from Cran or Github as follows: 28 | 29 | ```R 30 | # From Cran 31 | install.packages("fastrtext") 32 | 33 | # From Github 34 | # install.packages("devtools") 35 | devtools::install_github("pommedeterresautee/fastrtext") 36 | ``` 37 | 38 | Documentation 39 | ------------- 40 | 41 | All the updated documentation can be reached at this [address](https://pommedeterresautee.github.io/fastrtext/). 
42 | 43 | API 44 | --- 45 | 46 | API documentation can be reached at this [address](https://pommedeterresautee.github.io/fastrtext/reference/index.html). 47 | 48 | In particular, command line options are listed [there](https://pommedeterresautee.github.io/fastrtext/articles/list_command.html). 49 | 50 | ### Supervised learning (text classification) 51 | 52 | Data for a multi-class task are embedded in this package. 53 | Follow this [link](https://pommedeterresautee.github.io/fastrtext/articles/supervised_learning.html) to learn a model and then measure the accuracy in 5 minutes. 54 | 55 | 56 | ### Unsupervised learning (word representation) 57 | 58 | Data for a word representation learning task are embedded in this package. 59 | Following this [link](https://pommedeterresautee.github.io/fastrtext/articles/unsupervised_learning.html) will route you to a 5-minute tutorial to learn vector representations of words (aka word embeddings): 60 | 61 | Alternatives 62 | ------------ 63 | 64 | Why not use the command line client? 65 | 66 | * You can call the client from R using `system("fasttext ...")` ; 67 | * To get predictions, you would need to write a file, make predictions from the command line, then read the results ; 68 | * `fastrtext` makes your life easier by making all these operations in memory ; 69 | * It takes less time, and uses fewer commands ; 70 | * Easy to install from R directly. 71 | 72 | Why not use [fastTextR](https://github.com/mlampros/fastTextR/) ? 73 | 74 | * `fastrtext` implements both supervised and unsupervised parts of `fastText` (`fastTextR` implements only the unsupervised part) ; 75 | * with `fastrtext`, predictions can be done in memory (`fastTextR` requires writing the sentences to disk and reading the predictions back afterwards) ; 76 | * fastText original source code embedded in fastTextR is not up to date (it misses several new features and bug fixes added since January 2017). 
77 | 78 | References 79 | ---------- 80 | 81 | Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations or [2](#bag-of-tricks-for-efficient-text-classification) if using for text classification. 82 | 83 | ### Enriching Word Vectors with Subword Information 84 | 85 | [1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606) 86 | 87 | ``` 88 | @article{bojanowski2016enriching, 89 | title={Enriching Word Vectors with Subword Information}, 90 | author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, 91 | journal={arXiv preprint arXiv:1607.04606}, 92 | year={2016} 93 | } 94 | ``` 95 | 96 | ### Bag of Tricks for Efficient Text Classification 97 | 98 | [2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759) 99 | 100 | ``` 101 | @article{joulin2016bag, 102 | title={Bag of Tricks for Efficient Text Classification}, 103 | author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, 104 | journal={arXiv preprint arXiv:1607.01759}, 105 | year={2016} 106 | } 107 | ``` 108 | 109 | ### FastText.zip: Compressing text classification models 110 | 111 | [3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651) 112 | 113 | ``` 114 | @article{joulin2016fasttext, 115 | title={FastText.zip: Compressing text classification models}, 116 | author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas}, 117 | journal={arXiv preprint arXiv:1612.03651}, 118 | year={2016} 119 | } 120 | ``` 121 | 122 | (\* These authors contributed equally.) 
123 | -------------------------------------------------------------------------------- /docs/reference/print_help.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Print help — print_help • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Print command information, mainly to be used with the execute() function.

    131 | 132 |
    133 | 134 |
    print_help()
    135 | 136 | 137 |

    Examples

    138 |
    # NOT RUN {
    139 | print_help()
    140 | # }
    141 |
    142 |
    143 | 151 |
    152 | 153 |
    154 | 157 | 158 |
    159 |

    Site built with pkgdown 1.3.0.

    160 |
    161 |
    162 |
    163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /docs/reference/stop_words_sentences.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Stop words list — stop_words_sentences • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    List of words that can be safely removed from sentences.

    131 | 132 |
    133 | 134 |
    stop_words_sentences
    135 | 136 |

    Format

    137 | 138 |

    Character vector of stop words

    139 | 140 |

    Source

    141 | 142 |

    https://archive.ics.uci.edu/ml/datasets.html?format=&task=&att=&area=&numAtt=&numIns=&type=text&sort=nameUp&view=table

    143 | 144 | 145 |
    146 | 156 |
    157 | 158 |
    159 | 162 | 163 |
    164 |

    Site built with pkgdown 1.3.0.

    165 |
    166 |
    167 |
    168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /docs/reference/Rcpp_fastrtext-class.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Rcpp_fastrtext class — Rcpp_fastrtext-class • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Models are S4 objects with several slots (methods) which can be called this way: model$slot_name()

    131 | 132 |
    133 | 134 | 135 |

    Slots

    136 | 137 | 138 |
    139 |
    load

    Load a model

    140 |
    predict

    Make a prediction

    141 |
    execute

    Execute commands

    142 |
    get_vectors

    Get vectors related to provided words

    143 |
    get_parameters

    Get parameters used to train the model

    144 |
    get_dictionary

    List all words learned

    145 |
    get_labels

    List all labels learned

    146 |
    147 | 148 | 149 |
    150 | 158 |
    159 | 160 |
    161 | 164 | 165 |
    166 |

    Site built with pkgdown 1.3.0.

    167 |
    168 |
    169 |
    170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /src/fasttext/productquantizer.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include "productquantizer.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | real distL2(const real* x, const real* y, int32_t d) { 20 | real dist = 0; 21 | for (auto i = 0; i < d; i++) { 22 | auto tmp = x[i] - y[i]; 23 | dist += tmp * tmp; 24 | } 25 | return dist; 26 | } 27 | 28 | ProductQuantizer::ProductQuantizer(int32_t dim, int32_t dsub) 29 | : dim_(dim), 30 | nsubq_(dim / dsub), 31 | dsub_(dsub), 32 | centroids_(dim * ksub_), 33 | rng(seed_) { 34 | lastdsub_ = dim_ % dsub; 35 | if (lastdsub_ == 0) { 36 | lastdsub_ = dsub_; 37 | } else { 38 | nsubq_++; 39 | } 40 | } 41 | 42 | const real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) const { 43 | if (m == nsubq_ - 1) { 44 | return ¢roids_[m * ksub_ * dsub_ + i * lastdsub_]; 45 | } 46 | return ¢roids_[(m * ksub_ + i) * dsub_]; 47 | } 48 | 49 | real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) { 50 | if (m == nsubq_ - 1) { 51 | return ¢roids_[m * ksub_ * dsub_ + i * lastdsub_]; 52 | } 53 | return ¢roids_[(m * ksub_ + i) * dsub_]; 54 | } 55 | 56 | real ProductQuantizer::assign_centroid( 57 | const real* x, 58 | const real* c0, 59 | uint8_t* code, 60 | int32_t d) const { 61 | const real* c = c0; 62 | real dis = distL2(x, c, d); 63 | code[0] = 0; 64 | for (auto j = 1; j < ksub_; j++) { 65 | c += d; 66 | real disij = distL2(x, c, d); 67 | if (disij < dis) { 68 | code[0] = (uint8_t)j; 69 | dis = disij; 70 | } 71 | } 72 | return dis; 73 | } 74 | 75 | void 
ProductQuantizer::Estep( 76 | const real* x, 77 | const real* centroids, 78 | uint8_t* codes, 79 | int32_t d, 80 | int32_t n) const { 81 | for (auto i = 0; i < n; i++) { 82 | assign_centroid(x + i * d, centroids, codes + i, d); 83 | } 84 | } 85 | 86 | void ProductQuantizer::MStep( 87 | const real* x0, 88 | real* centroids, 89 | const uint8_t* codes, 90 | int32_t d, 91 | int32_t n) { 92 | std::vector nelts(ksub_, 0); 93 | memset(centroids, 0, sizeof(real) * d * ksub_); 94 | const real* x = x0; 95 | for (auto i = 0; i < n; i++) { 96 | auto k = codes[i]; 97 | real* c = centroids + k * d; 98 | for (auto j = 0; j < d; j++) { 99 | c[j] += x[j]; 100 | } 101 | nelts[k]++; 102 | x += d; 103 | } 104 | 105 | real* c = centroids; 106 | for (auto k = 0; k < ksub_; k++) { 107 | real z = (real)nelts[k]; 108 | if (z != 0) { 109 | for (auto j = 0; j < d; j++) { 110 | c[j] /= z; 111 | } 112 | } 113 | c += d; 114 | } 115 | 116 | std::uniform_real_distribution<> runiform(0, 1); 117 | for (auto k = 0; k < ksub_; k++) { 118 | if (nelts[k] == 0) { 119 | int32_t m = 0; 120 | while (runiform(rng) * (n - ksub_) >= nelts[m] - 1) { 121 | m = (m + 1) % ksub_; 122 | } 123 | memcpy(centroids + k * d, centroids + m * d, sizeof(real) * d); 124 | for (auto j = 0; j < d; j++) { 125 | int32_t sign = (j % 2) * 2 - 1; 126 | centroids[k * d + j] += sign * eps_; 127 | centroids[m * d + j] -= sign * eps_; 128 | } 129 | nelts[k] = nelts[m] / 2; 130 | nelts[m] -= nelts[k]; 131 | } 132 | } 133 | } 134 | 135 | void ProductQuantizer::kmeans(const real* x, real* c, int32_t n, int32_t d) { 136 | std::vector perm(n, 0); 137 | std::iota(perm.begin(), perm.end(), 0); 138 | std::shuffle(perm.begin(), perm.end(), rng); 139 | for (auto i = 0; i < ksub_; i++) { 140 | memcpy(&c[i * d], x + perm[i] * d, d * sizeof(real)); 141 | } 142 | auto codes = std::vector(n); 143 | for (auto i = 0; i < niter_; i++) { 144 | Estep(x, c, codes.data(), d, n); 145 | MStep(x, c, codes.data(), d, n); 146 | } 147 | } 148 | 149 | void 
ProductQuantizer::train(int32_t n, const real* x) { 150 | if (n < ksub_) { 151 | throw std::invalid_argument( 152 | "Matrix too small for quantization, must have at least " + 153 | std::to_string(ksub_) + " rows"); 154 | } 155 | std::vector perm(n, 0); 156 | std::iota(perm.begin(), perm.end(), 0); 157 | auto d = dsub_; 158 | auto np = std::min(n, max_points_); 159 | auto xslice = std::vector(np * dsub_); 160 | for (auto m = 0; m < nsubq_; m++) { 161 | if (m == nsubq_ - 1) { 162 | d = lastdsub_; 163 | } 164 | if (np != n) { 165 | std::shuffle(perm.begin(), perm.end(), rng); 166 | } 167 | for (auto j = 0; j < np; j++) { 168 | memcpy( 169 | xslice.data() + j * d, 170 | x + perm[j] * dim_ + m * dsub_, 171 | d * sizeof(real)); 172 | } 173 | kmeans(xslice.data(), get_centroids(m, 0), np, d); 174 | } 175 | } 176 | 177 | real ProductQuantizer::mulcode( 178 | const Vector& x, 179 | const uint8_t* codes, 180 | int32_t t, 181 | real alpha) const { 182 | real res = 0.0; 183 | auto d = dsub_; 184 | const uint8_t* code = codes + nsubq_ * t; 185 | for (auto m = 0; m < nsubq_; m++) { 186 | const real* c = get_centroids(m, code[m]); 187 | if (m == nsubq_ - 1) { 188 | d = lastdsub_; 189 | } 190 | for (auto n = 0; n < d; n++) { 191 | res += x[m * dsub_ + n] * c[n]; 192 | } 193 | } 194 | return res * alpha; 195 | } 196 | 197 | void ProductQuantizer::addcode( 198 | Vector& x, 199 | const uint8_t* codes, 200 | int32_t t, 201 | real alpha) const { 202 | auto d = dsub_; 203 | const uint8_t* code = codes + nsubq_ * t; 204 | for (auto m = 0; m < nsubq_; m++) { 205 | const real* c = get_centroids(m, code[m]); 206 | if (m == nsubq_ - 1) { 207 | d = lastdsub_; 208 | } 209 | for (auto n = 0; n < d; n++) { 210 | x[m * dsub_ + n] += alpha * c[n]; 211 | } 212 | } 213 | } 214 | 215 | void ProductQuantizer::compute_code(const real* x, uint8_t* code) const { 216 | auto d = dsub_; 217 | for (auto m = 0; m < nsubq_; m++) { 218 | if (m == nsubq_ - 1) { 219 | d = lastdsub_; 220 | } 221 | 
assign_centroid(x + m * dsub_, get_centroids(m, 0), code + m, d); 222 | } 223 | } 224 | 225 | void ProductQuantizer::compute_codes(const real* x, uint8_t* codes, int32_t n) 226 | const { 227 | for (auto i = 0; i < n; i++) { 228 | compute_code(x + i * dim_, codes + i * nsubq_); 229 | } 230 | } 231 | 232 | void ProductQuantizer::save(std::ostream& out) const { 233 | out.write((char*)&dim_, sizeof(dim_)); 234 | out.write((char*)&nsubq_, sizeof(nsubq_)); 235 | out.write((char*)&dsub_, sizeof(dsub_)); 236 | out.write((char*)&lastdsub_, sizeof(lastdsub_)); 237 | out.write((char*)centroids_.data(), centroids_.size() * sizeof(real)); 238 | } 239 | 240 | void ProductQuantizer::load(std::istream& in) { 241 | in.read((char*)&dim_, sizeof(dim_)); 242 | in.read((char*)&nsubq_, sizeof(nsubq_)); 243 | in.read((char*)&dsub_, sizeof(dsub_)); 244 | in.read((char*)&lastdsub_, sizeof(lastdsub_)); 245 | centroids_.resize(dim_ * ksub_); 246 | for (auto i = 0; i < centroids_.size(); i++) { 247 | in.read((char*)¢roids_[i], sizeof(real)); 248 | } 249 | } 250 | 251 | } // namespace fasttext 252 | -------------------------------------------------------------------------------- /docs/reference/load_model.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Load an existing fastText trained model — load_model • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Load and return a pointer to an existing model which will be used in other functions of this package.

    131 | 132 |
    133 | 134 |
    load_model(path)
    135 | 136 |

    Arguments

    137 | 138 | 139 | 140 | 141 | 142 | 143 |
    path

    path to the existing model

    144 | 145 | 146 |

    Examples

    147 |
    148 | library(fastrtext) 149 | model_test_path <- system.file("extdata", "model_classification_test.bin", package = "fastrtext") 150 | model <- load_model(model_test_path)
    151 |
    152 | 161 |
    162 | 163 |
    164 | 167 | 168 |
    169 |

    Site built with pkgdown 1.3.0.

    170 |
    171 |
    172 |
    173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /docs/reference/add_prefix.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Add a prefix to each word — add_prefix • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 49 | 50 | 51 | 52 | 53 | 54 |
    55 |
    56 | 117 | 118 | 119 |
    120 | 121 |
    122 |
    123 | 128 | 129 |
    130 | 131 |

    Add a custom prefix to each word of a line to create different spaces. 132 | Code in C++ (efficient).

    133 | 134 |
    135 | 136 |
    add_prefix(texts, prefix)
    137 | 138 |

    Arguments

    139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 |
    texts

    a character containing the original text

    prefix

    unit character containing the prefix to add (length == 1) or character with the same length as texts

    150 | 151 |

    Value

    152 | 153 |

    character with prefixed words.

    154 | 155 | 156 |

    Examples

    157 |
    add_prefix(c("this is a test", "this is another test"), "#")
    #> [1] "#this #is #a #test" "#this #is #another #test"
    158 |
    159 | 170 |
    171 | 172 |
    173 | 176 | 177 |
    178 |

    Site built with pkgdown 1.3.0.

    179 |
    180 |
    181 |
    182 | 183 | 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /docs/reference/get_word_distance.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Distance between two words — get_word_distance • fastrtext 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 116 | 117 | 118 |
    119 | 120 |
    121 |
    122 | 127 | 128 |
    129 | 130 |

    Distance is equal to 1 - cosine similarity between the two word vectors

    131 | 132 |
    133 | 134 |
    get_word_distance(model, w1, w2)
    135 | 136 |

    Arguments

    137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 |
    model

    trained fastText model. NULL if training a new model.

    w1

    first word to compare

    w2

    second word to compare

    152 | 153 |

    Value

    154 | 155 |

    a scalar with the distance

    156 | 157 | 158 |

    Examples

    159 |
    160 | library(fastrtext) 161 | model_test_path <- system.file("extdata", "model_unsupervised_test.bin", package = "fastrtext") 162 | model <- load_model(model_test_path) 163 | get_word_distance(model, "time", "timing")
    #> [,1] 164 | #> [1,] 0.5868116
    165 |
    166 |
    167 | 178 |
    179 | 180 | 189 |
    190 | 191 | 192 | 193 | 194 | 195 | 196 | --------------------------------------------------------------------------------