├── .gitignore ├── .Rbuildignore ├── LICENSE ├── tests ├── testthat │ ├── queries.txt │ ├── analogy_queries.txt │ ├── save_model_vecs │ │ └── DONT_DELETE_THIS_FILE.txt │ ├── text_sentence.txt │ ├── cooking_valid.txt │ ├── example_text.txt │ ├── cooking_supervised.txt │ ├── declaration_human_rights_english.txt │ └── test-fasttext.R └── testthat.R ├── man ├── figures │ ├── skipgram_vs_cbow.png │ └── static_contextualised_word_embeddings.png ├── compute_elapsed_time.Rd ├── printUsage.Rd ├── inner_elapsed_time.Rd ├── printNNUsage.Rd ├── printDumpUsage.Rd ├── printTestUsage.Rd ├── printQuantizeUsage.Rd ├── printAnalogiesUsage.Rd ├── printTestLabelUsage.Rd ├── printPrintNgramsUsage.Rd ├── printPredictUsage.Rd ├── printPrintWordVectorsUsage.Rd ├── printPrintSentenceVectorsUsage.Rd ├── print_parameters.Rd ├── multiplot.Rd ├── give_args_fasttext.Rd ├── plot_progress_logs.Rd ├── language_identification.Rd └── fasttext_interface.Rd ├── vignettes ├── progress_fasttext.png └── language_identification │ ├── time.png │ ├── missing.png │ ├── accuracy.png │ ├── cld2_vec.png │ ├── cld3_vec.png │ ├── decl_dat.png │ ├── franc_res.png │ ├── cld2_vec_acc.png │ ├── cld3_vec_acc.png │ ├── dtbl_res_in.png │ ├── percentage.png │ ├── unique_textc.png │ ├── dtbl_multiling.png │ ├── dtbl_out_decl.png │ ├── franc_vec_decl.png │ ├── nams_profiles.png │ ├── textc_decl_acc.png │ ├── textc_decl_iso.png │ ├── dtbl_res_in_bin.png │ ├── dtbl_res_in_decl.png │ ├── isocodes_fasttext.png │ ├── merg_labels_cld2.png │ ├── merg_labels_cld3.png │ ├── merg_labels_textc.png │ ├── unique_textc_char.png │ ├── cat_head_willi_2018.png │ ├── dtbl_multiling_cld2.png │ ├── dtbl_multiling_cld3.png │ ├── franc_vec_decl_acc.png │ ├── nams_profiles_char.png │ ├── textc_decl_iso_trim.png │ ├── dtbl_multiling_franc.png │ ├── dtbl_res_in_decl_acc.png │ ├── list_files_willi_2018.png │ ├── merg_labels_textc_char.png │ ├── multilingual_sentence.png │ ├── names_tc_byte_profiles.png │ ├── names_tc_char_profiles.png │ ├── 
print_acc_merg_labels.png │ ├── print_acc_merg_labels_bin.png │ └── fasttext_benchmark_confussion_matr.png ├── inst ├── language_identification │ └── lid.176.ftz ├── include │ ├── real.h │ ├── utils.h │ ├── qmatrix.h │ ├── meter.h │ ├── vector.h │ ├── args.h │ ├── productquantizer.h │ ├── matrix.h │ ├── dictionary.h │ ├── model.h │ └── fasttext.h └── CITATION ├── R ├── fastText.R └── RcppExports.R ├── src ├── Makevars ├── Makevars.win ├── utils.cc ├── meter.cc ├── init.c ├── vector.cc ├── qmatrix.cc ├── matrix.cc ├── RcppExports.cpp ├── productquantizer.cc ├── args.cc ├── model.cc └── dictionary.cc ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── issue_template.md └── workflows │ ├── issue.yml │ ├── stale-actions.yml │ └── tic.yml ├── tic.R ├── NAMESPACE ├── DESCRIPTION ├── NEWS.md ├── README.Rmd └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | .Rhistory 3 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^\.ccache$ 2 | ^\.github$ 3 | ^tic\.R$ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016-present 2 | COPYRIGHT HOLDER: Facebook, Inc. 
3 | -------------------------------------------------------------------------------- /tests/testthat/queries.txt: -------------------------------------------------------------------------------- 1 | salt 2 | word 3 | pepper 4 | do 5 | not 6 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(fastText) 3 | 4 | test_check("fastText") 5 | -------------------------------------------------------------------------------- /man/figures/skipgram_vs_cbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/man/figures/skipgram_vs_cbow.png -------------------------------------------------------------------------------- /vignettes/progress_fasttext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/progress_fasttext.png -------------------------------------------------------------------------------- /tests/testthat/analogy_queries.txt: -------------------------------------------------------------------------------- 1 | cheese potato recipe 2 | beans soup tomato 3 | bread milk sugar 4 | salt pepper oil 5 | -------------------------------------------------------------------------------- /inst/language_identification/lid.176.ftz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/inst/language_identification/lid.176.ftz -------------------------------------------------------------------------------- /vignettes/language_identification/time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/time.png 
-------------------------------------------------------------------------------- /R/fastText.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib fastText, .registration = TRUE 2 | #' @importFrom Rcpp evalCpp 3 | #' @importFrom utils globalVariables 4 | NULL 5 | -------------------------------------------------------------------------------- /vignettes/language_identification/missing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/missing.png -------------------------------------------------------------------------------- /vignettes/language_identification/accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/accuracy.png -------------------------------------------------------------------------------- /vignettes/language_identification/cld2_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/cld2_vec.png -------------------------------------------------------------------------------- /vignettes/language_identification/cld3_vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/cld3_vec.png -------------------------------------------------------------------------------- /vignettes/language_identification/decl_dat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/decl_dat.png -------------------------------------------------------------------------------- 
/vignettes/language_identification/franc_res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/franc_res.png -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = -DNDEBUG 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | CXX_STD = CXX11 4 | PKG_CPPFLAGS = -I../inst/include/ 5 | -------------------------------------------------------------------------------- /vignettes/language_identification/cld2_vec_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/cld2_vec_acc.png -------------------------------------------------------------------------------- /vignettes/language_identification/cld3_vec_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/cld3_vec_acc.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_res_in.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_res_in.png -------------------------------------------------------------------------------- /vignettes/language_identification/percentage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/percentage.png -------------------------------------------------------------------------------- /vignettes/language_identification/unique_textc.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/unique_textc.png -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = -DNDEBUG 2 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | CXX_STD = CXX11 4 | PKG_CPPFLAGS = -I../inst/include/ 5 | -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_multiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_multiling.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_out_decl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_out_decl.png -------------------------------------------------------------------------------- /vignettes/language_identification/franc_vec_decl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/franc_vec_decl.png -------------------------------------------------------------------------------- /vignettes/language_identification/nams_profiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/nams_profiles.png -------------------------------------------------------------------------------- /vignettes/language_identification/textc_decl_acc.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/textc_decl_acc.png -------------------------------------------------------------------------------- /vignettes/language_identification/textc_decl_iso.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/textc_decl_iso.png -------------------------------------------------------------------------------- /man/figures/static_contextualised_word_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/man/figures/static_contextualised_word_embeddings.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_res_in_bin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_res_in_bin.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_res_in_decl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_res_in_decl.png -------------------------------------------------------------------------------- /vignettes/language_identification/isocodes_fasttext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/isocodes_fasttext.png -------------------------------------------------------------------------------- /vignettes/language_identification/merg_labels_cld2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/merg_labels_cld2.png -------------------------------------------------------------------------------- /vignettes/language_identification/merg_labels_cld3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/merg_labels_cld3.png -------------------------------------------------------------------------------- /vignettes/language_identification/merg_labels_textc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/merg_labels_textc.png -------------------------------------------------------------------------------- /vignettes/language_identification/unique_textc_char.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/unique_textc_char.png -------------------------------------------------------------------------------- /vignettes/language_identification/cat_head_willi_2018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/cat_head_willi_2018.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_multiling_cld2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_multiling_cld2.png -------------------------------------------------------------------------------- 
/vignettes/language_identification/dtbl_multiling_cld3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_multiling_cld3.png -------------------------------------------------------------------------------- /vignettes/language_identification/franc_vec_decl_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/franc_vec_decl_acc.png -------------------------------------------------------------------------------- /vignettes/language_identification/nams_profiles_char.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/nams_profiles_char.png -------------------------------------------------------------------------------- /vignettes/language_identification/textc_decl_iso_trim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/textc_decl_iso_trim.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_multiling_franc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_multiling_franc.png -------------------------------------------------------------------------------- /vignettes/language_identification/dtbl_res_in_decl_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/dtbl_res_in_decl_acc.png 
-------------------------------------------------------------------------------- /vignettes/language_identification/list_files_willi_2018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/list_files_willi_2018.png -------------------------------------------------------------------------------- /vignettes/language_identification/merg_labels_textc_char.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/merg_labels_textc_char.png -------------------------------------------------------------------------------- /vignettes/language_identification/multilingual_sentence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/multilingual_sentence.png -------------------------------------------------------------------------------- /vignettes/language_identification/names_tc_byte_profiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/names_tc_byte_profiles.png -------------------------------------------------------------------------------- /vignettes/language_identification/names_tc_char_profiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/names_tc_char_profiles.png -------------------------------------------------------------------------------- /vignettes/language_identification/print_acc_merg_labels.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/print_acc_merg_labels.png -------------------------------------------------------------------------------- /vignettes/language_identification/print_acc_merg_labels_bin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/print_acc_merg_labels_bin.png -------------------------------------------------------------------------------- /vignettes/language_identification/fasttext_benchmark_confussion_matr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlampros/fastText/HEAD/vignettes/language_identification/fasttext_benchmark_confussion_matr.png -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 2 | 3 | blank_issues_enabled: true 4 | -------------------------------------------------------------------------------- /tic.R: -------------------------------------------------------------------------------- 1 | # installs dependencies, runs R CMD check, runs covr::codecov() 2 | do_package_checks() 3 | 4 | if (ci_on_ghactions() && ci_has_env("BUILD_PKGDOWN")) { 5 | # creates pkgdown site and pushes to gh-pages branch 6 | # only for the runner with the "BUILD_PKGDOWN" env var set 7 | do_pkgdown() 8 | } 9 | -------------------------------------------------------------------------------- /tests/testthat/save_model_vecs/DONT_DELETE_THIS_FILE.txt: -------------------------------------------------------------------------------- 1 | 'testthat' removes empty folders when building the package and 
this folder is needed in order to save the results of the tests ( otherwise during package-checking I'll receive an error ). The data are saved in the 'fastText.Rcheck' folder which is created in any case in the user's workspace. 2 | -------------------------------------------------------------------------------- /inst/include/real.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | namespace fasttext { 12 | 13 | typedef float real; 14 | 15 | } 16 | -------------------------------------------------------------------------------- /tests/testthat/text_sentence.txt: -------------------------------------------------------------------------------- 1 | How much does potato starch affect a cheese sauce recipe 2 | Dangerous pathogens capable of growing in acidic environments 3 | How do I cover up the white spots on my cast iron stove 4 | How do I cover up the white spots on my cast iron stove 5 | Michelin Three Star Restaurant but if the chef is not there 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report or feature request 3 | about: Describe a bug you've encountered or make a case for a new feature 4 | --- 5 | 6 | Please briefly describe your problem and what output you expect. 
If you have a question, you also have the option of https://stackoverflow.com/ (but I'm flexible if it's not too complicated) 7 | 8 | Please include a minimal reproducible example 9 | 10 | Please give a brief description of the problem 11 | 12 | Please add your Operating System (e.g., Windows10, Macintosh, Linux) and the R version that you use (e.g., 3.6.2) 13 | -------------------------------------------------------------------------------- /man/compute_elapsed_time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{compute_elapsed_time} 4 | \alias{compute_elapsed_time} 5 | \title{elapsed time in hours & minutes & seconds} 6 | \usage{ 7 | compute_elapsed_time(time_start) 8 | } 9 | \arguments{ 10 | \item{time_start}{a numeric value specifying the start time} 11 | } 12 | \value{ 13 | It does not return a value but only prints the time in form of a character string in the R session 14 | } 15 | \description{ 16 | elapsed time in hours & minutes & seconds 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/printUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printUsage} 4 | \alias{printUsage} 5 | \title{Print Usage Information for all parameters} 6 | \usage{ 7 | printUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information for all parameters 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printUsage() 23 | 24 | } 25 | 
-------------------------------------------------------------------------------- /man/inner_elapsed_time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{inner_elapsed_time} 4 | \alias{inner_elapsed_time} 5 | \title{inner function of 'compute_elapsed_time'} 6 | \usage{ 7 | inner_elapsed_time(secs, estimated = FALSE) 8 | } 9 | \arguments{ 10 | \item{secs}{a numeric value specifying the seconds} 11 | 12 | \item{estimated}{a boolean. If TRUE then the output label becomes the 'Estimated time'} 13 | } 14 | \value{ 15 | a character string showing the estimated or elapsed time 16 | } 17 | \description{ 18 | inner function of 'compute_elapsed_time' 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /src/utils.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "utils.h" 10 | 11 | #include <ios> 12 | 13 | namespace fasttext { 14 | 15 | namespace utils { 16 | 17 | int64_t size(std::ifstream& ifs) { 18 | ifs.seekg(std::streamoff(0), std::ios::end); 19 | return ifs.tellg(); 20 | } 21 | 22 | void seek(std::ifstream& ifs, int64_t pos) { 23 | ifs.clear(); 24 | ifs.seekg(std::streampos(pos)); 25 | } 26 | } // namespace utils 27 | 28 | } // namespace fasttext 29 | -------------------------------------------------------------------------------- /man/printNNUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printNNUsage} 4 | \alias{printNNUsage} 5 | \title{Print Usage Information when the command equals to 'nn'} 6 | \usage{ 7 | printNNUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printNNUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'nn' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printNNUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printDumpUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printDumpUsage} 4 | \alias{printDumpUsage} 5 | \title{Print Usage Information when the command equals to 'dump'} 6 | \usage{ 7 | printDumpUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printDumpUsage' function in the R session 14 | } 15 | 
\description{ 16 | Print Usage Information when the command equals to 'dump' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printDumpUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printTestUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printTestUsage} 4 | \alias{printTestUsage} 5 | \title{Print Usage Information when the command equals to 'test'} 6 | \usage{ 7 | printTestUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printTestUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'test' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printTestUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printQuantizeUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printQuantizeUsage} 4 | \alias{printQuantizeUsage} 5 | \title{Print Usage Information when the command equals to 'quantize'} 6 | \usage{ 7 | printQuantizeUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printQuantizeUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'quantize' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printQuantizeUsage() 23 | 24 | } 25 | 
-------------------------------------------------------------------------------- /man/printAnalogiesUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printAnalogiesUsage} 4 | \alias{printAnalogiesUsage} 5 | \title{Print Usage Information when the command equals to 'analogies'} 6 | \usage{ 7 | printAnalogiesUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printAnalogiesUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'analogies' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printAnalogiesUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printTestLabelUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printTestLabelUsage} 4 | \alias{printTestLabelUsage} 5 | \title{Print Usage Information when the command equals to 'test-label'} 6 | \usage{ 7 | printTestLabelUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printTestLabelUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'test-label' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printTestLabelUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printPrintNgramsUsage.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printPrintNgramsUsage} 4 | \alias{printPrintNgramsUsage} 5 | \title{Print Usage Information when the command equals to 'print-ngrams'} 6 | \usage{ 7 | printPrintNgramsUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printPrintNgramsUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'print-ngrams' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printPrintNgramsUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printPredictUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printPredictUsage} 4 | \alias{printPredictUsage} 5 | \title{Print Usage Information when the command equals to 'predict' or 'predict-prob'} 6 | \usage{ 7 | printPredictUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printPredictUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'predict' or 'predict-prob' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printPredictUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/printPrintWordVectorsUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do 
not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{printPrintWordVectorsUsage} 4 | \alias{printPrintWordVectorsUsage} 5 | \title{Print Usage Information when the command equals to 'print-word-vectors'} 6 | \usage{ 7 | printPrintWordVectorsUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printPrintWordVectorsUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'print-word-vectors' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printPrintWordVectorsUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /.github/workflows/issue.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://github.com/Renato66/auto-label 2 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 3 | 4 | name: Labeling new issue 5 | on: 6 | issues: 7 | types: ['opened'] 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: Renato66/auto-label@v2 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | ignore-comments: true 16 | labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help","how can i"]}' 17 | labels-not-allowed: '["documentation","duplicate","good first issue","help wanted","invalid"]' 18 | default-labels: '["triage"]' 19 | -------------------------------------------------------------------------------- /man/printPrintSentenceVectorsUsage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | 
\name{printPrintSentenceVectorsUsage} 4 | \alias{printPrintSentenceVectorsUsage} 5 | \title{Print Usage Information when the command equals to 'print-sentence-vectors'} 6 | \usage{ 7 | printPrintSentenceVectorsUsage(verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{verbose}{if TRUE then information will be printed in the console} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters of the 'printPrintSentenceVectorsUsage' function in the R session 14 | } 15 | \description{ 16 | Print Usage Information when the command equals to 'print-sentence-vectors' 17 | } 18 | \examples{ 19 | 20 | library(fastText) 21 | 22 | printPrintSentenceVectorsUsage() 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/print_parameters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{print_parameters} 4 | \alias{print_parameters} 5 | \title{Print the parameters for a specific command} 6 | \usage{ 7 | print_parameters(command = "supervised") 8 | } 9 | \arguments{ 10 | \item{command}{a character string specifying the command for which the parameters should be printed in the R session. 
It should be one of "skipgram", "cbow", "supervised", "test", "test-label" or "quantize"} 11 | } 12 | \value{ 13 | It does not return a value but only prints the available parameters in the R session 14 | } 15 | \description{ 16 | Print the parameters for a specific command 17 | } 18 | \examples{ 19 | 20 | \dontrun{ 21 | 22 | library(fastText) 23 | 24 | print_parameters(command = 'supervised') 25 | } 26 | } 27 | \references{ 28 | https://github.com/facebookresearch/fastText#full-documentation 29 | 30 | https://github.com/facebookresearch/fastText/issues/341#issuecomment-339783130 31 | } 32 | -------------------------------------------------------------------------------- /man/multiplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{multiplot} 4 | \alias{multiplot} 5 | \title{Multiple plot function} 6 | \usage{ 7 | multiplot(..., plotlist = NULL, cols = 1, layout = NULL) 8 | } 9 | \arguments{ 10 | \item{...}{ellipsis to pass ggplot objects} 11 | 12 | \item{plotlist}{either NULL or a list of ggplot objects} 13 | 14 | \item{cols}{Number of columns in layout} 15 | 16 | \item{layout}{A matrix specifying the layout. If present, 'cols' is ignored} 17 | } 18 | \value{ 19 | It does not return a value but only shows the ggplots in the R session 20 | } 21 | \description{ 22 | Multiple plot function 23 | } 24 | \details{ 25 | If the layout is something like matrix(c(1,2,3,3), nrow = 2, byrow = TRUE), then plot 1 will 26 | go in the upper left, 2 will go in the upper right, and 3 will go all the way across the bottom. 
27 | } 28 | \references{ 29 | http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /inst/include/utils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | /* FASTTEXT_DEPRECATED(msg): deprecation attribute for clang/GCC and MSVC; expands to nothing on other compilers. */ #if defined(__clang__) || defined(__GNUC__) 17 | #define FASTTEXT_DEPRECATED(msg) __attribute__((__deprecated__(msg))) 18 | #elif defined(_MSC_VER) 19 | #define FASTTEXT_DEPRECATED(msg) __declspec(deprecated(msg)) 20 | #else 21 | #define FASTTEXT_DEPRECATED(msg) 22 | #endif 23 | 24 | namespace fasttext { 25 | 26 | namespace utils { 27 | 28 | int64_t size(std::ifstream&); 29 | 30 | void seek(std::ifstream&, int64_t); 31 | 32 | /* Returns true when value is found in container; the search is a std::find over the whole range. */ template 33 | bool contains(const std::vector& container, const T& value) { 34 | return std::find(container.begin(), container.end(), value) != 35 | container.end(); 36 | } 37 | 38 | } // namespace utils 39 | 40 | } // namespace fasttext 41 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(fasttext_interface) 4 | export(language_identification) 5 | export(plot_progress_logs) 6 | export(printAnalogiesUsage) 7 | export(printDumpUsage) 8 | export(printNNUsage) 9 | export(printPredictUsage) 10 | export(printPrintNgramsUsage) 11 | export(printPrintSentenceVectorsUsage) 12 | export(printPrintWordVectorsUsage) 13 | export(printQuantizeUsage) 14 | export(printTestLabelUsage) 15 | export(printTestUsage) 16 | export(printUsage) 17 |
export(print_parameters) 18 | importFrom(Rcpp,evalCpp) 19 | importFrom(data.table,data.table) 20 | importFrom(data.table,fread) 21 | importFrom(ggplot2,aes) 22 | importFrom(ggplot2,element_text) 23 | importFrom(ggplot2,geom_line) 24 | importFrom(ggplot2,ggplot) 25 | importFrom(ggplot2,ggtitle) 26 | importFrom(ggplot2,theme) 27 | importFrom(glue,glue) 28 | importFrom(grid,grid.layout) 29 | importFrom(grid,grid.newpage) 30 | importFrom(grid,pushViewport) 31 | importFrom(grid,viewport) 32 | importFrom(stats,na.omit) 33 | importFrom(utils,globalVariables) 34 | importFrom(utils,read.table) 35 | useDynLib(fastText, .registration = TRUE) 36 | -------------------------------------------------------------------------------- /inst/include/qmatrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include "real.h" 19 | 20 | #include "matrix.h" 21 | #include "vector.h" 22 | 23 | #include "productquantizer.h" 24 | 25 | namespace fasttext { 26 | 27 | class QMatrix { 28 | protected: 29 | std::unique_ptr pq_; 30 | std::unique_ptr npq_; 31 | 32 | std::vector codes_; 33 | std::vector norm_codes_; 34 | 35 | bool qnorm_; 36 | 37 | int64_t m_; 38 | int64_t n_; 39 | 40 | int32_t codesize_; 41 | 42 | public: 43 | QMatrix(); 44 | QMatrix(const Matrix&, int32_t, bool); 45 | 46 | int64_t getM() const; 47 | int64_t getN() const; 48 | 49 | void quantizeNorm(const Vector&); 50 | void quantize(const Matrix&); 51 | 52 | void addToVector(Vector& x, int32_t t) const; 53 | real dotRow(const Vector&, int64_t) const; 54 | 55 | void save(std::ostream&); 56 | void load(std::istream&); 57 | }; 58 | 59 | } // namespace fasttext 60 | -------------------------------------------------------------------------------- /man/give_args_fasttext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{give_args_fasttext} 4 | \alias{give_args_fasttext} 5 | \title{The Rcpp function which is used in the 'fasttext_interface' R function} 6 | \usage{ 7 | give_args_fasttext( 8 | args, 9 | pth = "", 10 | MilliSecs = 100L, 11 | pth_in = "", 12 | queryWord = "", 13 | remove_previous_file = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{args}{the arguments that will be passed to the function in the form of a character vector} 18 | 19 | \item{pth}{a character string specifying the path where the process-logs (or output in general) should be saved} 20 | 21 | \item{MilliSecs}{an integer specifying the delay in milliseconds when printing the results to the specified path_output} 22 | 23 | \item{pth_in}{a character string specifying the path to the input data
file} 24 | 25 | \item{queryWord}{either an empty string or the queryword that should be passed to the function} 26 | 27 | \item{remove_previous_file}{a boolean. If TRUE, in case that the path_output is not an empty string (""), then an existing file with the same output name will be removed} 28 | } 29 | \value{ 30 | It does not return a value but only saves the results to a file 31 | } 32 | \description{ 33 | The Rcpp function which is used in the 'fasttext_interface' R function 34 | } 35 | \keyword{internal} 36 | -------------------------------------------------------------------------------- /tests/testthat/cooking_valid.txt: -------------------------------------------------------------------------------- 1 | __label__equipment __label__cast-iron How do I fix a cast iron pot that was heated empty for hours? 2 | __label__oven How does grill/broil mode in a convection oven work? 3 | __label__sauce __label__indian-cuisine __label__breakfast What are the names of the breakfast spreads used in Indian cuisine? 4 | __label__chili-peppers __label__spicy-hot How to get the most chili flavour out of a chili pepper? 5 | __label__bread What is the secret to baking bread with a very fine crumb? 6 | __label__eggs Are egg whites generally available at the store? 7 | __label__baking __label__bulk-cooking What are the differences between baking in bulk and baking in smaller amounts? 8 | __label__teflon I left a non-stick pan on the stove for an hour 9 | __label__roasting __label__peeling __label__chestnuts How to peel chestnuts? 10 | __label__roasting __label__beets How do I roast beets to easily remove the skins? 11 | __label__food-science __label__indian-cuisine Why are Parathas smeared with fat/oil within layers? 12 | __label__eggs __label__sugar __label__custard __label__pudding Custard Pudding tasting like raw eggs 13 | __label__substitutions __label__salt __label__curing Is Himalayan pink salt the same as the pink salt used for curing? 
14 | __label__cleaning What is a mild detergent? 15 | __label__food-safety __label__meat __label__liver Soaking liver in milk: in or out of the fridge? 16 | -------------------------------------------------------------------------------- /man/plot_progress_logs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{plot_progress_logs} 4 | \alias{plot_progress_logs} 5 | \title{Plot the progress of loss, learning-rate and word-counts} 6 | \usage{ 7 | plot_progress_logs(path_logs = "progress_data.txt", plot = FALSE) 8 | } 9 | \arguments{ 10 | \item{path_logs}{a character string specifying a valid path to a file where the progress-logs are saved} 11 | 12 | \item{plot}{a boolean specifying if the loss, learning-rate and word-counts should be plotted} 13 | } 14 | \value{ 15 | an object of class data.frame that includes the progress logs with columns 'progress', 'words_sec_thread', 'learning_rate' and 'loss' 16 | } 17 | \description{ 18 | Plot the progress of loss, learning-rate and word-counts 19 | } 20 | \examples{ 21 | 22 | \dontrun{ 23 | 24 | library(fastText) 25 | 26 | #----------------------------------------------------------------- 27 | # the 'progress_data.txt' file corresponds to the 'path_output' 28 | # parameter of the 'fasttext_interface()'. Therefore the user has 29 | # to run first the 'fasttext_interface()' function to save the 30 | # 'progress_data.txt' file to the desired folder. 
31 | #----------------------------------------------------------------- 32 | 33 | res = plot_progress_logs(path_logs = file.path(tempdir(), "progress_data.txt"), 34 | plot = TRUE) 35 | 36 | } 37 | } 38 | \references{ 39 | http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ 40 | } 41 | -------------------------------------------------------------------------------- /.github/workflows/stale-actions.yml: -------------------------------------------------------------------------------- 1 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 2 | 3 | name: "Mark or close stale issues and PRs" 4 | 5 | on: 6 | schedule: 7 | - cron: "00 * * * *" 8 | 9 | jobs: 10 | stale: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/stale@v3 14 | with: 15 | repo-token: ${{ secrets.GITHUB_TOKEN }} 16 | days-before-stale: 12 17 | days-before-close: 7 18 | stale-issue-message: "This is Robo-lampros because the Human-lampros is lazy. This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs. Feel free to re-open a closed issue and the Human-lampros will respond." 19 | stale-pr-message: "This is Robo-lampros because the Human-lampros is lazy. This PR has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs." 20 | close-issue-message: "This issue was automatically closed because of being stale. Feel free to re-open a closed issue and the Human-lampros will respond." 21 | close-pr-message: "This PR was automatically closed because of being stale."
22 | stale-pr-label: "stale" 23 | stale-issue-label: "stale" 24 | exempt-issue-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 25 | exempt-pr-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 26 | -------------------------------------------------------------------------------- /inst/include/meter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | 14 | #include "dictionary.h" 15 | #include "real.h" 16 | 17 | namespace fasttext { 18 | 19 | /* Accumulates label counts over logged examples, both overall (metrics_) and per label (labelMetrics_), and derives precision, recall and F1 from them. */ class Meter { 20 | /* Raw counters: gold labels seen, labels predicted, and predictions that matched a gold label. All start at 0. */ struct Metrics { 21 | uint64_t gold; 22 | uint64_t predicted; 23 | uint64_t predictedGold; 24 | 25 | Metrics() : gold(0), predicted(0), predictedGold(0) {} 26 | 27 | /* The three ratios below divide in double and do not guard against a zero denominator (predicted == 0, gold == 0, or both). */ double precision() const { 28 | return predictedGold / double(predicted); 29 | } 30 | double recall() const { 31 | return predictedGold / double(gold); 32 | } 33 | double f1Score() const { 34 | return 2 * predictedGold / double(predicted + gold); 35 | } 36 | }; 37 | 38 | public: 39 | Meter() : metrics_(), nexamples_(0), labelMetrics_() {} 40 | 41 | /* Records the gold labels and the predictions of one example. */ void log( 42 | const std::vector& labels, 43 | const std::vector>& predictions); 44 | 45 | /* Per-label variants take a label id and are non-const; the overloads without an argument report the overall counters. */ double precision(int32_t); 46 | double recall(int32_t); 47 | double f1Score(int32_t); 48 | double precision() const; 49 | double recall() const; 50 | uint64_t nexamples() const { 51 | return nexamples_; 52 | } 53 | void writeGeneralMetrics(std::ostream& out, int32_t k) const; 54 | 55 | private: 56 | Metrics metrics_{}; 57 | uint64_t nexamples_; 58 | std::unordered_map labelMetrics_; 59 | }; 60 | 61 | } // namespace fasttext 62 | -------------------------------------------------------------------------------- /inst/include/vector.h:
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "real.h" 16 | 17 | namespace fasttext { 18 | 19 | class Matrix; 20 | class QMatrix; 21 | 22 | /* Move-only numeric vector backed by a std::vector (data_): copy construction and copy assignment are deleted. */ class Vector { 23 | protected: 24 | std::vector data_; 25 | 26 | public: 27 | explicit Vector(int64_t); 28 | Vector(const Vector&) = delete; 29 | Vector(Vector&&) noexcept; 30 | Vector& operator=(const Vector&) = delete; 31 | /* NOTE(review): move assignment is not declared noexcept although the move constructor is - confirm this is intentional. */ Vector& operator=(Vector&&); 32 | 33 | /* Raw pointer to the underlying storage. */ inline real* data() { 34 | return data_.data(); 35 | } 36 | inline const real* data() const { 37 | return data_.data(); 38 | } 39 | /* Element access forwards to data_[i]; no bounds check is done here. */ inline real& operator[](int64_t i) { 40 | return data_[i]; 41 | } 42 | inline const real& operator[](int64_t i) const { 43 | return data_[i]; 44 | } 45 | 46 | /* Number of elements, converted from data_.size() to int64_t. */ inline int64_t size() const { 47 | return data_.size(); 48 | } 49 | void zero(); 50 | void mul(real); 51 | real norm() const; 52 | void addVector(const Vector& source); 53 | void addVector(const Vector&, real); 54 | void addRow(const Matrix&, int64_t); 55 | void addRow(const QMatrix&, int64_t); 56 | void addRow(const Matrix&, int64_t, real); 57 | void mul(const QMatrix&, const Vector&); 58 | void mul(const Matrix&, const Vector&); 59 | int64_t argmax(); 60 | }; 61 | 62 | std::ostream& operator<<(std::ostream&, const Vector&); 63 | 64 | } // namespace fasttext 65 | -------------------------------------------------------------------------------- /inst/include/args.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved.
4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace fasttext { 17 | 18 | /* Scoped enums with an explicit int underlying type; enumerator values start at 1. */ enum class model_name : int { cbow = 1, sg, sup }; 19 | enum class loss_name : int { hs = 1, ns, softmax, ova }; 20 | 21 | /* Option container: public data members plus member functions to parse, print help for, save, load and dump them (their implementations are not part of this header). */ class Args { 22 | protected: 23 | std::string lossToString(loss_name) const; 24 | std::string boolToString(bool) const; 25 | std::string modelToString(model_name) const; 26 | 27 | public: 28 | Args(); 29 | std::string input; 30 | std::string output; 31 | double lr; 32 | int lrUpdateRate; 33 | int dim; 34 | int ws; 35 | int epoch; 36 | int minCount; 37 | int minCountLabel; 38 | int neg; 39 | int wordNgrams; 40 | loss_name loss; 41 | model_name model; 42 | int bucket; 43 | int minn; 44 | int maxn; 45 | int thread; 46 | double t; 47 | std::string label; 48 | int verbose; 49 | std::string pretrainedVectors; 50 | bool saveOutput; 51 | 52 | /* NOTE(review): the five members below look quantization-related (cf. printQuantizationHelp) - confirm against args.cc. */ bool qout; 53 | bool retrain; 54 | bool qnorm_param; 55 | size_t cutoff; 56 | size_t dsub; 57 | 58 | void parseArgs(const std::vector& args); 59 | void printHelp(); 60 | void printBasicHelp(); 61 | void printDictionaryHelp(); 62 | void printTrainingHelp(); 63 | void printQuantizationHelp(); 64 | void save(std::ostream&); 65 | void load(std::istream&); 66 | void dump(std::ostream&) const; 67 | }; 68 | } // namespace fasttext 69 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: fastText 2 | Type: Package 3 | Title: Efficient Learning of Word Representations and Sentence Classification 4 | Version: 1.0.4 5 | Date: 2023-01-30 6 | Authors@R: c( person("Lampros", "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546")), person("Facebook",
"Inc", role = "cph") ) 7 | URL: https://github.com/mlampros/fastText 8 | BugReports: https://github.com/mlampros/fastText/issues 9 | Description: An interface to the 'fastText' library for efficient learning of word representations and sentence classification. The 'fastText' algorithm is explained in detail in (i) "Enriching Word Vectors with Subword Information", Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, 2017, <doi:10.1162/tacl_a_00051>; (ii) "Bag of Tricks for Efficient Text Classification", Armand Joulin, Edouard Grave, Piotr Bojanowski, Tomas Mikolov, 2017, <doi:10.18653/v1/E17-2068>; (iii) "FastText.zip: Compressing text classification models", Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou, Tomas Mikolov, 2016, <arXiv:1612.03651>. 10 | License: MIT + file LICENSE 11 | SystemRequirements: Generally, fastText builds on modern Mac OS and Linux distributions. Since it uses some C++11 features, it requires a compiler with good C++11 support. These include (g++-4.7.2 or newer) or (clang-3.3 or newer). 12 | Encoding: UTF-8 13 | Imports: 14 | Rcpp (>= 1.0.0), 15 | ggplot2, 16 | grid, 17 | utils, 18 | glue, 19 | data.table, 20 | stats 21 | Depends: R(>= 3.2.3) 22 | LinkingTo: Rcpp 23 | Suggests: 24 | testthat, 25 | covr, 26 | knitr, 27 | rmarkdown 28 | VignetteBuilder: knitr 29 | RoxygenNote: 7.2.3 30 | -------------------------------------------------------------------------------- /src/meter.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree.
7 | */ 8 | 9 | #include "meter.h" 10 | #include "utils.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | /* Records one example: bumps the example count, adds the number of gold labels and of predictions to the overall counters, then updates the per-label counters. A prediction counts as a hit when its label (prediction.second) is among the gold labels. */ void Meter::log( 20 | const std::vector& labels, 21 | const std::vector>& predictions) { 22 | nexamples_++; 23 | metrics_.gold += labels.size(); 24 | metrics_.predicted += predictions.size(); 25 | 26 | for (const auto& prediction : predictions) { 27 | labelMetrics_[prediction.second].predicted++; 28 | 29 | if (utils::contains(labels, prediction.second)) { 30 | labelMetrics_[prediction.second].predictedGold++; 31 | metrics_.predictedGold++; 32 | } 33 | } 34 | 35 | /* Every gold label adds to its own per-label gold total. */ for (const auto& label : labels) { 36 | labelMetrics_[label].gold++; 37 | } 38 | } 39 | 40 | /* Per-label accessors look the label up with operator[] on labelMetrics_ (declared as a std::unordered_map in meter.h), which creates a zeroed entry for an id that has not been seen; this is why they are non-const. */ double Meter::precision(int32_t i) { 41 | return labelMetrics_[i].precision(); 42 | } 43 | 44 | double Meter::recall(int32_t i) { 45 | return labelMetrics_[i].recall(); 46 | } 47 | 48 | double Meter::f1Score(int32_t i) { 49 | return labelMetrics_[i].f1Score(); 50 | } 51 | 52 | /* Overall precision and recall, computed from the aggregate counters. */ double Meter::precision() const { 53 | return metrics_.precision(); 54 | } 55 | 56 | double Meter::recall() const { 57 | return metrics_.recall(); 58 | } 59 | 60 | /* Writes three tab-separated lines to out: N (number of examples), P@k and R@k. */ void Meter::writeGeneralMetrics(std::ostream& out, int32_t k) const { 61 | out << "N" 62 | << "\t" << nexamples_ << std::endl; 63 | /* setprecision is sticky: out keeps precision 3 after this function returns. */ out << std::setprecision(3); 64 | out << "P@" << k << "\t" << metrics_.precision() << std::endl; 65 | out << "R@" << k << "\t" << metrics_.recall() << std::endl; 66 | } 67 | 68 | } // namespace fasttext 69 | -------------------------------------------------------------------------------- /inst/include/productquantizer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree.
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "real.h" 18 | #include "vector.h" 19 | 20 | namespace fasttext { 21 | 22 | class ProductQuantizer { 23 | protected: 24 | /* Fixed constants: ksub_ = 1 << 8 = 256 and max_points_ = 256 * 256 = 65536. */ const int32_t nbits_ = 8; 25 | const int32_t ksub_ = 1 << nbits_; 26 | const int32_t max_points_per_cluster_ = 256; 27 | const int32_t max_points_ = max_points_per_cluster_ * ksub_; 28 | const int32_t seed_ = 1234; 29 | const int32_t niter_ = 25; 30 | const real eps_ = 1e-7; 31 | 32 | /* NOTE(review): these four members have no default initializers and the default constructor below has an empty body, so they are indeterminate until assigned elsewhere (presumably by load) - confirm. */ int32_t dim_; 33 | int32_t nsubq_; 34 | int32_t dsub_; 35 | int32_t lastdsub_; 36 | 37 | std::vector centroids_; 38 | 39 | std::minstd_rand rng; 40 | 41 | public: 42 | ProductQuantizer() {} 43 | ProductQuantizer(int32_t, int32_t); 44 | 45 | /* Const and non-const access to centroid storage; the meaning of the two index arguments is not visible in this header. */ real* get_centroids(int32_t, uint8_t); 46 | const real* get_centroids(int32_t, uint8_t) const; 47 | 48 | real assign_centroid(const real*, const real*, uint8_t*, int32_t) const; 49 | void Estep(const real*, const real*, uint8_t*, int32_t, int32_t) const; 50 | void MStep(const real*, real*, const uint8_t*, int32_t, int32_t); 51 | void kmeans(const real*, real*, int32_t, int32_t); 52 | void train(int, const real*); 53 | 54 | real mulcode(const Vector&, const uint8_t*, int32_t, real) const; 55 | void addcode(Vector&, const uint8_t*, int32_t, real) const; 56 | void compute_code(const real*, uint8_t*) const; 57 | void compute_codes(const real*, uint8_t*, int32_t) const; 58 | 59 | void save(std::ostream&); 60 | void load(std::istream&); 61 | }; 62 | 63 | } // namespace fasttext 64 | -------------------------------------------------------------------------------- /inst/include/matrix.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree.
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include "real.h" 18 | 19 | namespace fasttext { 20 | 21 | class Vector; 22 | 23 | /* Dense matrix with m_ rows and n_ columns stored row-major in data_: element (i, j) lives at data_[i * n_ + j]. Copy construction is defaulted, copy assignment is deleted. */ class Matrix { 24 | protected: 25 | std::vector data_; 26 | /* NOTE(review): m_ and n_ are const while a non-const load(std::istream&) is declared below - confirm how load sets the dimensions. */ const int64_t m_; 27 | const int64_t n_; 28 | 29 | public: 30 | Matrix(); 31 | explicit Matrix(int64_t, int64_t); 32 | Matrix(const Matrix&) = default; 33 | Matrix& operator=(const Matrix&) = delete; 34 | 35 | inline real* data() { 36 | return data_.data(); 37 | } 38 | inline const real* data() const { 39 | return data_.data(); 40 | } 41 | 42 | /* Element access by (row, column); indices are not range-checked here. */ inline const real& at(int64_t i, int64_t j) const { 43 | return data_[i * n_ + j]; 44 | }; 45 | inline real& at(int64_t i, int64_t j) { 46 | return data_[i * n_ + j]; 47 | }; 48 | 49 | /* dim 0 returns the row count, any other value returns the column count; values other than 0 or 1 are only caught by the assert. */ inline int64_t size(int64_t dim) const { 50 | assert(dim == 0 || dim == 1); 51 | if (dim == 0) { 52 | return m_; 53 | } 54 | return n_; 55 | } 56 | inline int64_t rows() const { 57 | return m_; 58 | } 59 | inline int64_t cols() const { 60 | return n_; 61 | } 62 | void zero(); 63 | void uniform(real); 64 | real dotRow(const Vector&, int64_t) const; 65 | void addRow(const Vector&, int64_t, real); 66 | 67 | /* ib and ie default to 0 and -1; how -1 is interpreted is defined in matrix.cc, not in this header. */ void multiplyRow(const Vector& nums, int64_t ib = 0, int64_t ie = -1); 68 | void divideRow(const Vector& denoms, int64_t ib = 0, int64_t ie = -1); 69 | 70 | real l2NormRow(int64_t i) const; 71 | void l2NormRow(Vector& norms) const; 72 | 73 | void save(std::ostream&); 74 | void load(std::istream&); 75 | 76 | void dump(std::ostream&) const; 77 | }; 78 | } // namespace fasttext 79 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("Please cite both the package and the original articles / software in your publications:") 2 | 3 | year <- sub("-.*", "", meta$Date) 4 | note <- sprintf("R package version %s", meta$Version) 5 | 6 | bibentry( 7 | bibtype =
"Manual", 8 | title = "{fastText}: Efficient Learning of Word Representations and Sentence Classification using R", 9 | author = person("Lampros", "Mouselimis"), 10 | year = year, 11 | note = note, 12 | url = "https://CRAN.R-project.org/package=fastText" 13 | ) 14 | 15 | bibentry( 16 | bibtype = "Manual", 17 | title = "{fastText}: Library for fast text representation and classification", 18 | author = person("Facebook", "Inc"), 19 | year = 2016, 20 | url = "https://github.com/facebookresearch/fastText" 21 | ) 22 | 23 | bibentry( 24 | bibtype = "Article", 25 | title = "Enriching Word Vectors with Subword Information", 26 | author = c(as.person("Piotr Bojanowski"), as.person("Edouard Grave"), as.person("Armand Joulin"), as.person("Tomas Mikolov")), 27 | journal = "Transactions of the Association for Computational Linguistics", 28 | year = "2017", 29 | volume = "5", 30 | pages = "135--146", 31 | doi = "10.1162/tacl_a_00051" 32 | ) 33 | 34 | bibentry( 35 | bibtype = "InProceedings", 36 | title = "Bag of Tricks for Efficient Text Classification", 37 | author = c(as.person("Armand Joulin"), as.person("Edouard Grave"), as.person("Piotr Bojanowski"), as.person("Tomas Mikolov")), 38 | booktitle = "Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers", 39 | year = "2017", 40 | publisher = "Association for Computational Linguistics", 41 | pages = "427--431" 42 | ) 43 | 44 | bibentry( 45 | bibtype = "Article", 46 | title = "FastText.zip: Compressing text classification models", 47 | author = c(as.person("Armand Joulin"), as.person("Edouard Grave"), as.person("Piotr Bojanowski"), as.person("Matthijs Douze"), as.person("Herve Jegou"), as.person("Tomas Mikolov")), 48 | journal = "arXiv preprint arXiv:1612.03651", 49 | year = "2016" 50 | ) 51 | -------------------------------------------------------------------------------- /tests/testthat/example_text.txt:
-------------------------------------------------------------------------------- 1 | The term planet is ancient, with ties to history, astrology, science, mythology, and religion. Several planets in the Solar System can be seen with the naked eye. These were regarded by many early cultures as divine, or as emissaries of deities. As scientific knowledge advanced, human perception of the planets changed, incorporating a number of disparate objects. In 2006, the International Astronomical Union (IAU) officially adopted a resolution defining planets within the Solar System. This definition is controversial because it excludes many objects of planetary mass based on where or what they orbit. 2 | Although eight of the planetary bodies discovered before 1950 remain planets under the modern definition, some celestial bodies, such as Ceres, Pallas, Juno and Vesta (each an object in the solar asteroid belt), and Pluto (the first trans-Neptunian object discovered), that were once considered planets by the scientific community, are no longer viewed as such. 3 | The planets were thought by Ptolemy to orbit Earth in deferent and epicycle motions. Although the idea that the planets orbited the Sun had been suggested many times, it was not until the 17th century that this view was supported by evidence from the first telescopic astronomical observations, performed by Galileo Galilei. 4 | At about the same time, by careful analysis of pre-telescopic observation data collected by Tycho Brahe, Johannes Kepler found the planets orbits were not circular but elliptical. As observational tools improved, astronomers saw that, like Earth, the planets rotated around tilted axes, and some shared such features as ice caps and seasons. Since the dawn of the Space Age, close observation by space probes has found that Earth and the other planets share characteristics such as volcanism, hurricanes, tectonics, and even hydrology. 
5 | Planets are generally divided into two main types: large lowdensity giant planets, and smaller rocky terrestrials. Under IAU definitions, there are eight planets in the Solar System. In order of increasing distance from the Sun, they are the four terrestrials, Mercury, Venus, Earth, and Mars, then the four giant planets, Jupiter, Saturn, Uranus, and Neptune. Six of the planets are orbited by one or more natural satellites. 6 | -------------------------------------------------------------------------------- /man/language_identification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{language_identification} 4 | \alias{language_identification} 5 | \title{Language Identification using fastText} 6 | \usage{ 7 | language_identification( 8 | input_obj, 9 | pre_trained_language_model_path, 10 | k = 1, 11 | th = 0, 12 | threads = 1, 13 | verbose = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{input_obj}{either a valid character string to a valid path where each line represents a different text extract or a vector of text extracts} 18 | 19 | \item{pre_trained_language_model_path}{a valid character string to the pre-trained language identification model path, for more info see https://fasttext.cc/docs/en/language-identification.html} 20 | 21 | \item{k}{predict top k labels (1 by default)} 22 | 23 | \item{th}{probability threshold (0.0 by default)} 24 | 25 | \item{threads}{an integer specifying the number of threads to run in parallel. 
This parameter applies only if k > 1} 26 | 27 | \item{verbose}{if TRUE then information will be printed out in the console} 28 | } 29 | \value{ 30 | an object of class data.table which includes two or more columns with the names 'iso_lang_N' and 'prob_N' where 'N' corresponds to 1 to 'k' input parameter 31 | } 32 | \description{ 33 | Language Identification using fastText 34 | } 35 | \examples{ 36 | 37 | library(fastText) 38 | 39 | vec_txt = c("Incapaz de distinguir la luna y la cara de esta chica, 40 | Las estrellas se ponen nerviosas en el cielo", 41 | "Unable to tell apart the moon and this girl's face, 42 | Stars are flustered up in the sky.") 43 | 44 | file_pretrained = system.file("language_identification/lid.176.ftz", package = "fastText") 45 | 46 | dtbl_out = language_identification(input_obj = vec_txt, 47 | pre_trained_language_model_path = file_pretrained, 48 | k = 3, 49 | th = 0.0, 50 | verbose = TRUE) 51 | dtbl_out 52 | } 53 | \references{ 54 | https://fasttext.cc/docs/en/language-identification.html 55 | https://becominghuman.ai/a-handy-pre-trained-model-for-language-identification-cadd89db9db8 56 | } 57 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code. 
8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP _fastText_give_args_fasttext(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 12 | extern SEXP _fastText_printAnalogiesUsage(SEXP); 13 | extern SEXP _fastText_printDumpUsage(SEXP); 14 | extern SEXP _fastText_printNNUsage(SEXP); 15 | extern SEXP _fastText_printPredictUsage(SEXP); 16 | extern SEXP _fastText_printPrintNgramsUsage(SEXP); 17 | extern SEXP _fastText_printPrintSentenceVectorsUsage(SEXP); 18 | extern SEXP _fastText_printPrintWordVectorsUsage(SEXP); 19 | extern SEXP _fastText_printQuantizeUsage(SEXP); 20 | extern SEXP _fastText_printTestLabelUsage(SEXP); 21 | extern SEXP _fastText_printTestUsage(SEXP); 22 | extern SEXP _fastText_printUsage(SEXP); 23 | 24 | static const R_CallMethodDef CallEntries[] = { 25 | {"_fastText_give_args_fasttext", (DL_FUNC) &_fastText_give_args_fasttext, 6}, 26 | {"_fastText_printAnalogiesUsage", (DL_FUNC) &_fastText_printAnalogiesUsage, 1}, 27 | {"_fastText_printDumpUsage", (DL_FUNC) &_fastText_printDumpUsage, 1}, 28 | {"_fastText_printNNUsage", (DL_FUNC) &_fastText_printNNUsage, 1}, 29 | {"_fastText_printPredictUsage", (DL_FUNC) &_fastText_printPredictUsage, 1}, 30 | {"_fastText_printPrintNgramsUsage", (DL_FUNC) &_fastText_printPrintNgramsUsage, 1}, 31 | {"_fastText_printPrintSentenceVectorsUsage", (DL_FUNC) &_fastText_printPrintSentenceVectorsUsage, 1}, 32 | {"_fastText_printPrintWordVectorsUsage", (DL_FUNC) &_fastText_printPrintWordVectorsUsage, 1}, 33 | {"_fastText_printQuantizeUsage", (DL_FUNC) &_fastText_printQuantizeUsage, 1}, 34 | {"_fastText_printTestLabelUsage", (DL_FUNC) &_fastText_printTestLabelUsage, 1}, 35 | {"_fastText_printTestUsage", (DL_FUNC) &_fastText_printTestUsage, 1}, 36 | {"_fastText_printUsage", (DL_FUNC) &_fastText_printUsage, 1}, 37 | {NULL, NULL, 0} 38 | }; 39 | 40 | void R_init_fastText(DllInfo *dll) 41 | { 42 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 43 | R_useDynamicSymbols(dll, FALSE); 44 | } 45 | 
-------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | ## fastText 1.0.4 3 | 4 | * I added a figure to the README.md file showing the differences between *static* and *contextualised* word embeddings 5 | 6 | 7 | ## fastText 1.0.3 8 | 9 | * I added a test case for the *'language_identification()'* function (see Github issue: https://github.com/mlampros/fastText/issues/3) 10 | * I added the '*verbose*' parameter to the C++ functions of the '*src/main.cc*' file that did not take any variables as input to avoid the CRAN WARNING *'function declaration isn't a prototype [-Wstrict-prototypes]'* 11 | 12 | 13 | ## fastText 1.0.2 14 | 15 | * I added the *URL* and *BugReports* fields in the *DESCRIPTION* file 16 | * I updated the documentation of the *print_parameters()* function 17 | * I updated the details section of the *fasttext_interface()* function regarding the *output* parameter which exists in the named list that is passed to the *list_params* parameter of the *fasttext_interface()* function. Although this *output* parameter is a file path (and not a directory name) it will be saved in both *.vec* and *.bin* file name extensions. 
18 | 19 | 20 | ## fastText 1.0.1 21 | 22 | * I fixed the **LTO** (Link Time Optimization) error - Additional Issues - by replacing the **qnorm** variable with **qnorm_param** in the Rcpp files 23 | * I modified the **quantize** function in the Rcpp file to return the **.ftz** file by specifying the exact file path 24 | * I adjusted the **Examples** section of the **fasttext_interface** function and the **testthat tests** to account for the changes in the **quantize** function 25 | * I fixed a bug in the **fasttext_interface** function related to parameters that do not take a value 26 | 27 | 28 | ## fastText 1.0.0 29 | 30 | * I've added the *CITATION* file in the 'inst' directory 31 | * I've added the **language_identification()** function 32 | * **20-04-2021** : I've added the pre-trained language identification model **lid.176.ftz** which can be downloaded from https://fasttext.cc/docs/en/language-identification.html . The same website also hosts the **lid.176.bin** model which is bigger in size, faster and slightly more accurate. 33 | * **14-07-2019** : I fixed typos in the vignette and modified the *plot_progress_logs()* function because it threw an error of the form : *line 1 did not have 11 elements* ( I added the *fill = TRUE* parameter to the *utils::read.table()* function to account for NA's as described in a [stackoverflow issue](https://stackoverflow.com/a/18161099/8302386) ) 34 | -------------------------------------------------------------------------------- /src/vector.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree.
7 | */ 8 | 9 | #include "vector.h" 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "matrix.h" 18 | #include "qmatrix.h" 19 | 20 | namespace fasttext { 21 | 22 | Vector::Vector(int64_t m) : data_(m) {} 23 | 24 | Vector::Vector(Vector&& other) noexcept : data_(std::move(other.data_)) {} 25 | 26 | Vector& Vector::operator=(Vector&& other) { 27 | data_ = std::move(other.data_); 28 | return *this; 29 | } 30 | 31 | void Vector::zero() { 32 | std::fill(data_.begin(), data_.end(), 0.0); 33 | } 34 | 35 | real Vector::norm() const { 36 | real sum = 0; 37 | for (int64_t i = 0; i < size(); i++) { 38 | sum += data_[i] * data_[i]; 39 | } 40 | return std::sqrt(sum); 41 | } 42 | 43 | void Vector::mul(real a) { 44 | for (int64_t i = 0; i < size(); i++) { 45 | data_[i] *= a; 46 | } 47 | } 48 | 49 | void Vector::addVector(const Vector& source) { 50 | assert(size() == source.size()); 51 | for (int64_t i = 0; i < size(); i++) { 52 | data_[i] += source.data_[i]; 53 | } 54 | } 55 | 56 | void Vector::addVector(const Vector& source, real s) { 57 | assert(size() == source.size()); 58 | for (int64_t i = 0; i < size(); i++) { 59 | data_[i] += s * source.data_[i]; 60 | } 61 | } 62 | 63 | void Vector::addRow(const Matrix& A, int64_t i) { 64 | assert(i >= 0); 65 | assert(i < A.size(0)); 66 | assert(size() == A.size(1)); 67 | for (int64_t j = 0; j < A.size(1); j++) { 68 | data_[j] += A.at(i, j); 69 | } 70 | } 71 | 72 | void Vector::addRow(const Matrix& A, int64_t i, real a) { 73 | assert(i >= 0); 74 | assert(i < A.size(0)); 75 | assert(size() == A.size(1)); 76 | for (int64_t j = 0; j < A.size(1); j++) { 77 | data_[j] += a * A.at(i, j); 78 | } 79 | } 80 | 81 | void Vector::addRow(const QMatrix& A, int64_t i) { 82 | assert(i >= 0); 83 | A.addToVector(*this, i); 84 | } 85 | 86 | void Vector::mul(const Matrix& A, const Vector& vec) { 87 | assert(A.size(0) == size()); 88 | assert(A.size(1) == vec.size()); 89 | for (int64_t i = 0; i < size(); i++) { 90 | data_[i] = 
A.dotRow(vec, i); 91 | } 92 | } 93 | 94 | void Vector::mul(const QMatrix& A, const Vector& vec) { 95 | assert(A.getM() == size()); 96 | assert(A.getN() == vec.size()); 97 | for (int64_t i = 0; i < size(); i++) { 98 | data_[i] = A.dotRow(vec, i); 99 | } 100 | } 101 | 102 | int64_t Vector::argmax() { 103 | real max = data_[0]; 104 | int64_t argmax = 0; 105 | for (int64_t i = 1; i < size(); i++) { 106 | if (data_[i] > max) { 107 | max = data_[i]; 108 | argmax = i; 109 | } 110 | } 111 | return argmax; 112 | } 113 | 114 | std::ostream& operator<<(std::ostream& os, const Vector& v) { 115 | os << std::setprecision(5); 116 | for (int64_t j = 0; j < v.size(); j++) { 117 | os << v[j] << ' '; 118 | } 119 | return os; 120 | } 121 | 122 | } // namespace fasttext 123 | -------------------------------------------------------------------------------- /src/qmatrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "qmatrix.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace fasttext { 15 | 16 | QMatrix::QMatrix() : qnorm_(false), m_(0), n_(0), codesize_(0) {} 17 | 18 | QMatrix::QMatrix(const Matrix& mat, int32_t dsub, bool qnorm_param) 19 | : qnorm_(qnorm_param), 20 | m_(mat.size(0)), 21 | n_(mat.size(1)), 22 | codesize_(m_ * ((n_ + dsub - 1) / dsub)) { 23 | codes_.resize(codesize_); 24 | pq_ = std::unique_ptr(new ProductQuantizer(n_, dsub)); 25 | if (qnorm_) { 26 | norm_codes_.resize(m_); 27 | npq_ = std::unique_ptr(new ProductQuantizer(1, 1)); 28 | } 29 | quantize(mat); 30 | } 31 | 32 | void QMatrix::quantizeNorm(const Vector& norms) { 33 | assert(qnorm_); 34 | assert(norms.size() == m_); 35 | auto dataptr = norms.data(); 36 | npq_->train(m_, dataptr); 37 | npq_->compute_codes(dataptr, norm_codes_.data(), m_); 38 | } 39 | 40 | void QMatrix::quantize(const Matrix& matrix) { 41 | assert(m_ == matrix.size(0)); 42 | assert(n_ == matrix.size(1)); 43 | Matrix temp(matrix); 44 | if (qnorm_) { 45 | Vector norms(temp.size(0)); 46 | temp.l2NormRow(norms); 47 | temp.divideRow(norms); 48 | quantizeNorm(norms); 49 | } 50 | auto dataptr = temp.data(); 51 | pq_->train(m_, dataptr); 52 | pq_->compute_codes(dataptr, codes_.data(), m_); 53 | } 54 | 55 | void QMatrix::addToVector(Vector& x, int32_t t) const { 56 | real norm = 1; 57 | if (qnorm_) { 58 | norm = npq_->get_centroids(0, norm_codes_[t])[0]; 59 | } 60 | pq_->addcode(x, codes_.data(), t, norm); 61 | } 62 | 63 | real QMatrix::dotRow(const Vector& vec, int64_t i) const { 64 | assert(i >= 0); 65 | assert(i < m_); 66 | assert(vec.size() == n_); 67 | real norm = 1; 68 | if (qnorm_) { 69 | norm = npq_->get_centroids(0, norm_codes_[i])[0]; 70 | } 71 | return pq_->mulcode(vec, codes_.data(), i, norm); 72 | } 73 | 74 | int64_t QMatrix::getM() const { 75 | return m_; 76 | } 77 | 78 | int64_t QMatrix::getN() const { 79 | return n_; 80 | } 81 | 82 | void QMatrix::save(std::ostream& out) { 83 | 
out.write((char*)&qnorm_, sizeof(qnorm_)); 84 | out.write((char*)&m_, sizeof(m_)); 85 | out.write((char*)&n_, sizeof(n_)); 86 | out.write((char*)&codesize_, sizeof(codesize_)); 87 | out.write((char*)codes_.data(), codesize_ * sizeof(uint8_t)); 88 | pq_->save(out); 89 | if (qnorm_) { 90 | out.write((char*)norm_codes_.data(), m_ * sizeof(uint8_t)); 91 | npq_->save(out); 92 | } 93 | } 94 | 95 | void QMatrix::load(std::istream& in) { 96 | in.read((char*)&qnorm_, sizeof(qnorm_)); 97 | in.read((char*)&m_, sizeof(m_)); 98 | in.read((char*)&n_, sizeof(n_)); 99 | in.read((char*)&codesize_, sizeof(codesize_)); 100 | codes_ = std::vector(codesize_); 101 | in.read((char*)codes_.data(), codesize_ * sizeof(uint8_t)); 102 | pq_ = std::unique_ptr(new ProductQuantizer()); 103 | pq_->load(in); 104 | if (qnorm_) { 105 | norm_codes_ = std::vector(m_); 106 | in.read((char*)norm_codes_.data(), m_ * sizeof(uint8_t)); 107 | npq_ = std::unique_ptr(new ProductQuantizer()); 108 | npq_->load(in); 109 | } 110 | } 111 | 112 | } // namespace fasttext 113 | -------------------------------------------------------------------------------- /src/matrix.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "matrix.h" 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "utils.h" 16 | #include "vector.h" 17 | 18 | namespace fasttext { 19 | 20 | Matrix::Matrix() : Matrix(0, 0) {} 21 | 22 | Matrix::Matrix(int64_t m, int64_t n) : data_(m * n), m_(m), n_(n) {} 23 | 24 | void Matrix::zero() { 25 | std::fill(data_.begin(), data_.end(), 0.0); 26 | } 27 | 28 | void Matrix::uniform(real a) { 29 | std::minstd_rand rng(1); 30 | std::uniform_real_distribution<> uniform(-a, a); 31 | for (int64_t i = 0; i < (m_ * n_); i++) { 32 | data_[i] = uniform(rng); 33 | } 34 | } 35 | 36 | real Matrix::dotRow(const Vector& vec, int64_t i) const { 37 | assert(i >= 0); 38 | assert(i < m_); 39 | assert(vec.size() == n_); 40 | real d = 0.0; 41 | for (int64_t j = 0; j < n_; j++) { 42 | d += at(i, j) * vec[j]; 43 | } 44 | if (std::isnan(d)) { 45 | throw std::runtime_error("Encountered NaN."); 46 | } 47 | return d; 48 | } 49 | 50 | void Matrix::addRow(const Vector& vec, int64_t i, real a) { 51 | assert(i >= 0); 52 | assert(i < m_); 53 | assert(vec.size() == n_); 54 | for (int64_t j = 0; j < n_; j++) { 55 | data_[i * n_ + j] += a * vec[j]; 56 | } 57 | } 58 | 59 | void Matrix::multiplyRow(const Vector& nums, int64_t ib, int64_t ie) { 60 | if (ie == -1) { 61 | ie = m_; 62 | } 63 | assert(ie <= nums.size()); 64 | for (auto i = ib; i < ie; i++) { 65 | real n = nums[i - ib]; 66 | if (n != 0) { 67 | for (auto j = 0; j < n_; j++) { 68 | at(i, j) *= n; 69 | } 70 | } 71 | } 72 | } 73 | 74 | void Matrix::divideRow(const Vector& denoms, int64_t ib, int64_t ie) { 75 | if (ie == -1) { 76 | ie = m_; 77 | } 78 | assert(ie <= denoms.size()); 79 | for (auto i = ib; i < ie; i++) { 80 | real n = denoms[i - ib]; 81 | if (n != 0) { 82 | for (auto j = 0; j < n_; j++) { 83 | at(i, j) /= n; 84 | } 85 | } 86 | } 87 | } 88 | 89 | real Matrix::l2NormRow(int64_t i) const { 90 | auto norm = 0.0; 91 | for (auto j = 0; j < n_; j++) { 92 | norm += at(i, j) * at(i, j); 93 | } 94 | if 
(std::isnan(norm)) { 95 | throw std::runtime_error("Encountered NaN."); 96 | } 97 | return std::sqrt(norm); 98 | } 99 | 100 | void Matrix::l2NormRow(Vector& norms) const { 101 | assert(norms.size() == m_); 102 | for (auto i = 0; i < m_; i++) { 103 | norms[i] = l2NormRow(i); 104 | } 105 | } 106 | 107 | void Matrix::save(std::ostream& out) { 108 | out.write((char*)&m_, sizeof(int64_t)); 109 | out.write((char*)&n_, sizeof(int64_t)); 110 | out.write((char*)data_.data(), m_ * n_ * sizeof(real)); 111 | } 112 | 113 | void Matrix::load(std::istream& in) { 114 | in.read((char*)&m_, sizeof(int64_t)); 115 | in.read((char*)&n_, sizeof(int64_t)); 116 | data_ = std::vector(m_ * n_); 117 | in.read((char*)data_.data(), m_ * n_ * sizeof(real)); 118 | } 119 | 120 | void Matrix::dump(std::ostream& out) const { 121 | out << m_ << " " << n_ << std::endl; 122 | for (int64_t i = 0; i < m_; i++) { 123 | for (int64_t j = 0; j < n_; j++) { 124 | if (j > 0) { 125 | out << " "; 126 | } 127 | out << at(i, j); 128 | } 129 | out << std::endl; 130 | } 131 | } 132 | 133 | } // namespace fasttext 134 | -------------------------------------------------------------------------------- /inst/include/dictionary.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "args.h" 20 | #include "real.h" 21 | 22 | namespace fasttext { 23 | 24 | typedef int32_t id_type; 25 | enum class entry_type : int8_t { word = 0, label = 1 }; 26 | 27 | struct entry { 28 | std::string word; 29 | int64_t count; 30 | entry_type type; 31 | std::vector subwords; 32 | }; 33 | 34 | class Dictionary { 35 | protected: 36 | static const int32_t MAX_VOCAB_SIZE = 30000000; 37 | static const int32_t MAX_LINE_SIZE = 1024; 38 | 39 | int32_t find(const std::string&) const; 40 | int32_t find(const std::string&, uint32_t h) const; 41 | void initTableDiscard(); 42 | void initNgrams(); 43 | void reset(std::istream&) const; 44 | void pushHash(std::vector&, int32_t) const; 45 | void addSubwords(std::vector&, const std::string&, int32_t) const; 46 | 47 | std::shared_ptr args_; 48 | std::vector word2int_; 49 | std::vector words_; 50 | 51 | std::vector pdiscard_; 52 | int32_t size_; 53 | int32_t nwords_; 54 | int32_t nlabels_; 55 | int64_t ntokens_; 56 | 57 | int64_t pruneidx_size_; 58 | std::unordered_map pruneidx_; 59 | void addWordNgrams( 60 | std::vector& line, 61 | const std::vector& hashes, 62 | int32_t n) const; 63 | 64 | public: 65 | static const std::string EOS; 66 | static const std::string BOW; 67 | static const std::string EOW; 68 | 69 | explicit Dictionary(std::shared_ptr); 70 | explicit Dictionary(std::shared_ptr, std::istream&); 71 | int32_t nwords() const; 72 | int32_t nlabels() const; 73 | int64_t ntokens() const; 74 | int32_t getId(const std::string&) const; 75 | int32_t getId(const std::string&, uint32_t h) const; 76 | entry_type getType(int32_t) const; 77 | entry_type getType(const std::string&) const; 78 | bool discard(int32_t, real) const; 79 | std::string getWord(int32_t) const; 80 | const std::vector& getSubwords(int32_t) const; 81 | const std::vector getSubwords(const std::string&) const; 82 | void 
getSubwords( 83 | const std::string&, 84 | std::vector&, 85 | std::vector&) const; 86 | void computeSubwords( 87 | const std::string&, 88 | std::vector&, 89 | std::vector* substrings = nullptr) const; 90 | uint32_t hash(const std::string& str) const; 91 | void add(const std::string&); 92 | bool readWord(std::istream&, std::string&) const; 93 | void readFromFile(std::istream&); 94 | std::string getLabel(int32_t) const; 95 | void save(std::ostream&) const; 96 | void load(std::istream&); 97 | std::vector getCounts(entry_type) const; 98 | int32_t getLine(std::istream&, std::vector&, std::vector&) 99 | const; 100 | int32_t getLine(std::istream&, std::vector&, std::minstd_rand&) 101 | const; 102 | void threshold(int64_t, int64_t); 103 | void prune(std::vector&); 104 | bool isPruned() { 105 | return pruneidx_size_ >= 0; 106 | } 107 | void dump(std::ostream&) const; 108 | void init(); 109 | }; 110 | 111 | } // namespace fasttext 112 | -------------------------------------------------------------------------------- /inst/include/model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "args.h" 17 | #include "matrix.h" 18 | #include "qmatrix.h" 19 | #include "real.h" 20 | #include "vector.h" 21 | 22 | namespace fasttext { 23 | 24 | struct Node { 25 | int32_t parent; 26 | int32_t left; 27 | int32_t right; 28 | int64_t count; 29 | bool binary; 30 | }; 31 | 32 | class Model { 33 | protected: 34 | std::shared_ptr wi_; 35 | std::shared_ptr wo_; 36 | std::shared_ptr qwi_; 37 | std::shared_ptr qwo_; 38 | std::shared_ptr args_; 39 | Vector hidden_; 40 | Vector output_; 41 | Vector grad_; 42 | int32_t hsz_; 43 | int32_t osz_; 44 | real loss_; 45 | int64_t nexamples_; 46 | std::vector t_sigmoid_; 47 | std::vector t_log_; 48 | // used for negative sampling: 49 | std::vector negatives_; 50 | size_t negpos; 51 | // used for hierarchical softmax: 52 | std::vector> paths; 53 | std::vector> codes; 54 | std::vector tree; 55 | 56 | static bool comparePairs( 57 | const std::pair&, 58 | const std::pair&); 59 | 60 | int32_t getNegative(int32_t target); 61 | void initSigmoid(); 62 | void initLog(); 63 | void computeOutput(Vector&, Vector&) const; 64 | 65 | static const int32_t NEGATIVE_TABLE_SIZE = 10000000; 66 | 67 | public: 68 | Model( 69 | std::shared_ptr, 70 | std::shared_ptr, 71 | std::shared_ptr, 72 | int32_t); 73 | 74 | real binaryLogistic(int32_t, bool, real); 75 | real negativeSampling(int32_t, real); 76 | real hierarchicalSoftmax(int32_t, real); 77 | real softmax(int32_t, real); 78 | real oneVsAll(const std::vector&, real); 79 | 80 | void predict( 81 | const std::vector&, 82 | int32_t, 83 | real, 84 | std::vector>&, 85 | Vector&, 86 | Vector&) const; 87 | void predict( 88 | const std::vector&, 89 | int32_t, 90 | real, 91 | std::vector>&); 92 | void dfs( 93 | int32_t, 94 | real, 95 | int32_t, 96 | real, 97 | std::vector>&, 98 | Vector&) const; 99 | void findKBest( 100 | int32_t, 101 | real, 102 | std::vector>&, 103 | Vector&, 104 | Vector&) const; 105 | 
void update( 106 | const std::vector&, 107 | const std::vector&, 108 | int32_t, 109 | real); 110 | real computeLoss(const std::vector&, int32_t, real); 111 | void computeHidden(const std::vector&, Vector&) const; 112 | void computeOutputSigmoid(Vector&, Vector&) const; 113 | void computeOutputSoftmax(Vector&, Vector&) const; 114 | void computeOutputSoftmax(); 115 | 116 | void setTargetCounts(const std::vector&); 117 | void initTableNegatives(const std::vector&); 118 | void buildTree(const std::vector&); 119 | real getLoss() const; 120 | real sigmoid(real) const; 121 | real log(real) const; 122 | real std_log(real) const; 123 | 124 | std::minstd_rand rng; 125 | bool quant_; 126 | void 127 | setQuantizePointer(std::shared_ptr, std::shared_ptr, bool); 128 | 129 | static const int32_t kUnlimitedPredictions = -1; 130 | static const int32_t kAllLabelsAsTarget = -1; 131 | }; 132 | 133 | } // namespace fasttext 134 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | output: github_document 4 | --- 5 | 6 | [![tic](https://github.com/mlampros/fastText/workflows/tic/badge.svg?branch=master)](https://github.com/mlampros/fastText/actions) 7 | [![codecov.io](https://codecov.io/github/mlampros/fastText/coverage.svg?branch=master)](https://codecov.io/github/mlampros/fastText?branch=master) 8 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/fastText)](http://cran.r-project.org/package=fastText) 9 | [![Downloads](http://cranlogs.r-pkg.org/badges/grand-total/fastText?color=blue)](http://www.r-pkg.org/pkg/fastText) 10 | Buy Me A Coffee 11 | [![Dependencies](https://tinyverse.netlify.com/badge/fastText)](https://cran.r-project.org/package=fastText) 12 | 13 | 14 | ## fastText 15 |
16 | 17 | The **fastText** R package is an interface to the [fastText](https://github.com/facebookresearch/fastText) library for efficient learning of word representations and sentence classification. More details on the functionality of fastText can be found in the 18 | 19 | * [fastText_updated_version](http://mlampros.github.io/2019/04/11/fastText_updated_version/) (blog post) 20 | * [fasttext_language_identification](http://mlampros.github.io/2021/05/14/fasttext_language_identification/) (blog post) 21 | * [package documentation](https://mlampros.github.io/fastText/reference/index.html). 22 | 23 |
24 | 25 | The [official website of the fasttext algorithm](https://fasttext.cc/) includes more details regarding the supervised & unsupervised functions. The following image shows the difference between [**cbow** and **skipgram**](https://fasttext.cc/docs/en/unsupervised-tutorial.html#advanced-readers-skipgram-versus-cbow) (*models to compute word representations*) 26 | 27 |
28 | 29 | ![](./man/figures/skipgram_vs_cbow.png) 30 | 31 |
32 | 33 | Moreover, the following figure - extracted from [a survey (scientific paper) related to word embeddings](https://hal.science/hal-03148517/document) and recent advancements in Large Language Models - shows the differences between *static* and *contextualized* word embeddings 34 | 35 |
36 | 37 | ![](./man/figures/static_contextualised_word_embeddings.png) 38 | 39 |
40 | 41 | You can either install the package from CRAN using, 42 | 43 | ```R 44 | 45 | install.packages("fastText") 46 | 47 | 48 | ``` 49 | 50 |
51 | 52 | or from Github using the *install_github* function of the *remotes* package, 53 | 54 | ```R 55 | 56 | remotes::install_github('mlampros/fastText') 57 | 58 | 59 | ``` 60 |
61 | 62 | **or** directly download the fastText-zip file using the **Clone or download** button in the [repository page](https://github.com/mlampros/fastText), extract it locally (rename it to *fastText* if necessary and check that files such as DESCRIPTION, NAMESPACE etc. are present when you open the fastText folder) and then run, 63 | 64 | 65 | ```R 66 | 67 | #------------- 68 | # on a Unix OS 69 | #------------- 70 | 71 | setwd('/your_folder/fastText/') 72 | Rcpp::compileAttributes(verbose = TRUE) 73 | setwd('/your_folder/') 74 | system("R CMD build fastText") 75 | system("R CMD INSTALL fastText_1.0.4.tar.gz") 76 | 77 | 78 | #------------------ 79 | # on the Windows OS 80 | #------------------ 81 | 82 | setwd('C:/your_folder/fastText/') 83 | Rcpp::compileAttributes(verbose = TRUE) 84 | setwd('C:/your_folder/') 85 | system("R CMD build fastText") 86 | system("R CMD INSTALL fastText_1.0.4.tar.gz") 87 | 88 | ``` 89 |
90 | 91 | Use the following link to report bugs/issues (for the R package port), 92 |

93 | 94 | [https://github.com/mlampros/fastText/issues](https://github.com/mlampros/fastText/issues) 95 | 96 |
97 | 98 | ### **Citation:** 99 | 100 | If you use the **fastText** R package in your paper or research please cite both **fastText** and the **original articles / software** `https://CRAN.R-project.org/package=fastText`: 101 | 102 |
103 | 104 | ```R 105 | @Manual{, 106 | title = {{fastText}: Efficient Learning of Word Representations and 107 | Sentence Classification using R}, 108 | author = {Lampros Mouselimis}, 109 | year = {2021}, 110 | note = {R package version 1.0.3}, 111 | url = {https://CRAN.R-project.org/package=fastText}, 112 | } 113 | ``` 114 | 115 |
116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![tic](https://github.com/mlampros/fastText/workflows/tic/badge.svg?branch=master)](https://github.com/mlampros/fastText/actions) 3 | [![codecov.io](https://codecov.io/github/mlampros/fastText/coverage.svg?branch=master)](https://codecov.io/github/mlampros/fastText?branch=master) 4 | [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/fastText)](http://cran.r-project.org/package=fastText) 5 | [![Downloads](http://cranlogs.r-pkg.org/badges/grand-total/fastText?color=blue)](http://www.r-pkg.org/pkg/fastText) 6 | Buy Me A Coffee 7 | [![Dependencies](https://tinyverse.netlify.com/badge/fastText)](https://cran.r-project.org/package=fastText) 8 | 9 | ## fastText 10 | 11 |
12 | 13 | The **fastText** R package is an interface to the 14 | [fastText](https://github.com/facebookresearch/fastText) library for 15 | efficient learning of word representations and sentence classification. 16 | More details on the functionality of fastText can be found in the 17 | 18 | - [fastText\_updated\_version](http://mlampros.github.io/2019/04/11/fastText_updated_version/) 19 | (blog post) 20 | - [fasttext\_language\_identification](http://mlampros.github.io/2021/05/14/fasttext_language_identification/) 21 | (blog post) 22 | - [package 23 | documentation](https://mlampros.github.io/fastText/reference/index.html). 24 | 25 |
26 | 27 | The [official website of the fasttext algorithm](https://fasttext.cc/) 28 | includes more details regarding the supervised & unsupervised functions. 29 | The following image shows the difference between [**cbow** and 30 | **skipgram**](https://fasttext.cc/docs/en/unsupervised-tutorial.html#advanced-readers-skipgram-versus-cbow) 31 | (*models to compute word representations*) 32 | 33 |
34 | 35 | ![](./man/figures/skipgram_vs_cbow.png) 36 | 37 |
38 | 39 | Moreover, the following figure - extracted from [a survey (scientific 40 | paper) related to word 41 | embeddings](https://hal.science/hal-03148517/document) and recent 42 | advancements in Large Language Models - shows the differences between 43 | *static* and *contextualized* word embeddings 44 | 45 |
46 | 47 | ![](./man/figures/static_contextualised_word_embeddings.png) 48 | 49 |
50 | 51 | You can either install the package from CRAN using, 52 | 53 | ``` r 54 | install.packages("fastText") 55 | 56 | ``` 57 | 58 |
59 | 60 | or from Github using the *install\_github* function of the *remotes* 61 | package, 62 | 63 | ``` r 64 | remotes::install_github('mlampros/fastText') 65 | 66 | ``` 67 | 68 |
69 | 70 | **or** directly download the fastText-zip file using the **Clone or 71 | download** button in the [repository 72 | page](https://github.com/mlampros/fastText), extract it locally (rename 73 | it to *fastText* if necessary and check that files such as DESCRIPTION, 74 | NAMESPACE etc. are present when you open the fastText folder) and then 75 | run, 76 | 77 | ``` r 78 | #------------- 79 | # on a Unix OS 80 | #------------- 81 | 82 | setwd('/your_folder/fastText/') 83 | Rcpp::compileAttributes(verbose = TRUE) 84 | setwd('/your_folder/') 85 | system("R CMD build fastText") 86 | system("R CMD INSTALL fastText_1.0.4.tar.gz") 87 | 88 | 89 | #------------------ 90 | # on the Windows OS 91 | #------------------ 92 | 93 | setwd('C:/your_folder/fastText/') 94 | Rcpp::compileAttributes(verbose = TRUE) 95 | setwd('C:/your_folder/') 96 | system("R CMD build fastText") 97 | system("R CMD INSTALL fastText_1.0.4.tar.gz") 98 | ``` 99 | 100 |
101 | 102 | Use the following link to report bugs/issues (for the R package port): <https://github.com/mlampros/fastText/issues> 103 |

104 | 105 | 106 | 107 |
108 | 109 | ### **Citation:** 110 | 111 | If you use the **fastText** R package in your paper or research, please 112 | cite both the **fastText** R package (`https://CRAN.R-project.org/package=fastText`) 113 | and the **original articles / software**: 114 | 115 |
116 | 117 | ``` r 118 | @Manual{, 119 | title = {{fastText}: Efficient Learning of Word Representations and 120 | Sentence Classification using R}, 121 | author = {Lampros Mouselimis}, 122 | year = {2021}, 123 | note = {R package version 1.0.3}, 124 | url = {https://CRAN.R-project.org/package=fastText}, 125 | } 126 | ``` 127 | 128 |
129 | -------------------------------------------------------------------------------- /tests/testthat/cooking_supervised.txt: -------------------------------------------------------------------------------- 1 | __label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe? 2 | __label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments 3 | __label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove? 4 | __label__restaurant Michelin Three Star Restaurant; but if the chef is not there 5 | __label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables? 6 | __label__storage-method __label__equipment __label__bread What's the purpose of a bread box? 7 | __label__baking __label__food-safety __label__substitutions __label__peanuts how to seperate peanut oil from roasted peanuts at home? 8 | __label__chocolate American equivalent for British chocolate terms 9 | __label__baking __label__oven __label__convection Fan bake vs bake 10 | __label__sauce __label__storage-lifetime __label__acidity __label__mayonnaise Regulation and balancing of readymade packed mayonnaise and other sauces 11 | __label__tea What kind of tea do you boil for 45minutes? 12 | __label__baking __label__baking-powder __label__baking-soda __label__leavening How long can batter sit before chemical leaveners lose their power? 13 | __label__food-safety __label__soup Can I RE-freeze chicken soup after it has thawed? 14 | __label__sous-vide __label__vacuum Ziploc vacuumed bags expand in sous vide 15 | __label__baking __label__substitutions __label__syrup What can I use instead of corn syrup? 16 | __label__vegan __label__almonds __label__almond-milk Does soaking almonds have the same effect as blanching and removing the skins when making almond milk? 17 | __label__baking __label__cake __label__soda Cake sinks in the middle when baking. 
Only happens when I make a Coca-Cola Cake 18 | __label__baking Which plastic wrap is okay for oven use? 19 | __label__tea Can I dissolve sugar first before steeping tea? 20 | __label__food-safety __label__salmon Is it safe to eat food that was heated in plastic wrap to the point the plastic wrap flamed? 21 | __label__flavor __label__spices __label__chemistry Flavor and Chemical Composition of Thyme 22 | __label__equipment What can I use as a manual hard cheese slicer? 23 | __label__flour __label__milling Are stone or metal grinding wheels better for flour? 24 | __label__beans Do fava beans need to cook longer than other kinds of beans? 25 | __label__baking __label__bread __label__kneading Kneading Bread After Rising 26 | __label__beef __label__roast __label__gravy __label__roast-beef Extraordinary Beef Gravy? 27 | __label__baking __label__bread __label__crust How to heat up already baked french bread in oven to get a crispy crust 28 | __label__chocolate Is there a difference in appearance between semi and unsweetened chocolate? 29 | __label__food-science __label__marinade __label__brining If salt dehydrates the meat, then why would brining make it more juicy as a whole? 30 | __label__cookies __label__texture __label__american-cuisine How long after baking do American chewy cookies get their normal texture? 31 | __label__fruit __label__alcohol __label__liqueur Is cooking with fruit liqueur comparable to cooking with fruit juice? 32 | __label__bread __label__cheese __label__jelly __label__brie Suggestions for Brie + Bread + Preserves 33 | __label__soup __label__texture __label__standards What is the correct consistency of a cream soup? 34 | __label__food-science __label__tea Making tea - milk first or tea first 35 | __label__food-safety __label__salt Sea Salt and Mercury 36 | __label__cinnamon Cinnamon Thickening 37 | __label__sauce __label__flavor __label__syrup Basic carrier sauce/syrup for different sweet flavors? 
38 | __label__roasting __label__eggplant What is the 'cleanest' way to roast eggplants indoor? 39 | __label__rice Cooking and storing rice for a whole week 40 | __label__dehydrating dehydrating puree food 41 | __label__soup __label__canning __label__food-processing In industrially produced soup, how does each can contain equal parts of all ingredients? 42 | __label__flavor __label__microwave __label__popcorn How Is Microwave Popcorn Flavoured? 43 | __label__culinary-uses __label__vegetables __label__eggplant What can I do with under-ripe eggplant? 44 | __label__water __label__cocktails __label__whiskey Whiskey and Water 45 | __label__meat __label__ham Wet Cooked Ham Slices 46 | __label__onions __label__deep-frying __label__restaurant-mimicry Beer Battered Onion Rings -- what makes them look shiny? 47 | __label__wok Determining a wok's material 48 | __label__food-safety __label__oil Is cloudy-looking used peanut oil safe? 49 | __label__oil __label__cleaning __label__coconut __label__olive-oil __label__maintenance To finely spray a thin layer of warm liquid Coconut Oil? 50 | __label__candy __label__fudge Why Do We 'Simmer' Fudge Instead of 'Boiling' it? 
51 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // printUsage 14 | void printUsage(bool verbose); 15 | RcppExport SEXP _fastText_printUsage(SEXP verboseSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RNGScope rcpp_rngScope_gen; 18 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 19 | printUsage(verbose); 20 | return R_NilValue; 21 | END_RCPP 22 | } 23 | // printQuantizeUsage 24 | void printQuantizeUsage(bool verbose); 25 | RcppExport SEXP _fastText_printQuantizeUsage(SEXP verboseSEXP) { 26 | BEGIN_RCPP 27 | Rcpp::RNGScope rcpp_rngScope_gen; 28 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 29 | printQuantizeUsage(verbose); 30 | return R_NilValue; 31 | END_RCPP 32 | } 33 | // printTestUsage 34 | void printTestUsage(bool verbose); 35 | RcppExport SEXP _fastText_printTestUsage(SEXP verboseSEXP) { 36 | BEGIN_RCPP 37 | Rcpp::RNGScope rcpp_rngScope_gen; 38 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 39 | printTestUsage(verbose); 40 | return R_NilValue; 41 | END_RCPP 42 | } 43 | // printPredictUsage 44 | void printPredictUsage(bool verbose); 45 | RcppExport SEXP _fastText_printPredictUsage(SEXP verboseSEXP) { 46 | BEGIN_RCPP 47 | Rcpp::RNGScope rcpp_rngScope_gen; 48 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 49 | printPredictUsage(verbose); 50 | return R_NilValue; 51 | END_RCPP 52 | } 53 | // printTestLabelUsage 54 | void printTestLabelUsage(bool verbose); 55 | RcppExport SEXP 
_fastText_printTestLabelUsage(SEXP verboseSEXP) { 56 | BEGIN_RCPP 57 | Rcpp::RNGScope rcpp_rngScope_gen; 58 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 59 | printTestLabelUsage(verbose); 60 | return R_NilValue; 61 | END_RCPP 62 | } 63 | // printPrintWordVectorsUsage 64 | void printPrintWordVectorsUsage(bool verbose); 65 | RcppExport SEXP _fastText_printPrintWordVectorsUsage(SEXP verboseSEXP) { 66 | BEGIN_RCPP 67 | Rcpp::RNGScope rcpp_rngScope_gen; 68 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 69 | printPrintWordVectorsUsage(verbose); 70 | return R_NilValue; 71 | END_RCPP 72 | } 73 | // printPrintSentenceVectorsUsage 74 | void printPrintSentenceVectorsUsage(bool verbose); 75 | RcppExport SEXP _fastText_printPrintSentenceVectorsUsage(SEXP verboseSEXP) { 76 | BEGIN_RCPP 77 | Rcpp::RNGScope rcpp_rngScope_gen; 78 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 79 | printPrintSentenceVectorsUsage(verbose); 80 | return R_NilValue; 81 | END_RCPP 82 | } 83 | // printPrintNgramsUsage 84 | void printPrintNgramsUsage(bool verbose); 85 | RcppExport SEXP _fastText_printPrintNgramsUsage(SEXP verboseSEXP) { 86 | BEGIN_RCPP 87 | Rcpp::RNGScope rcpp_rngScope_gen; 88 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 89 | printPrintNgramsUsage(verbose); 90 | return R_NilValue; 91 | END_RCPP 92 | } 93 | // printNNUsage 94 | void printNNUsage(bool verbose); 95 | RcppExport SEXP _fastText_printNNUsage(SEXP verboseSEXP) { 96 | BEGIN_RCPP 97 | Rcpp::RNGScope rcpp_rngScope_gen; 98 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 99 | printNNUsage(verbose); 100 | return R_NilValue; 101 | END_RCPP 102 | } 103 | // printAnalogiesUsage 104 | void printAnalogiesUsage(bool verbose); 105 | RcppExport SEXP _fastText_printAnalogiesUsage(SEXP verboseSEXP) { 106 | BEGIN_RCPP 107 | Rcpp::RNGScope rcpp_rngScope_gen; 108 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 109 | 
printAnalogiesUsage(verbose); 110 | return R_NilValue; 111 | END_RCPP 112 | } 113 | // printDumpUsage 114 | void printDumpUsage(bool verbose); 115 | RcppExport SEXP _fastText_printDumpUsage(SEXP verboseSEXP) { 116 | BEGIN_RCPP 117 | Rcpp::RNGScope rcpp_rngScope_gen; 118 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 119 | printDumpUsage(verbose); 120 | return R_NilValue; 121 | END_RCPP 122 | } 123 | // give_args_fasttext 124 | void give_args_fasttext(std::vector args, std::string pth, int MilliSecs, std::string pth_in, std::string queryWord, bool remove_previous_file); 125 | RcppExport SEXP _fastText_give_args_fasttext(SEXP argsSEXP, SEXP pthSEXP, SEXP MilliSecsSEXP, SEXP pth_inSEXP, SEXP queryWordSEXP, SEXP remove_previous_fileSEXP) { 126 | BEGIN_RCPP 127 | Rcpp::RNGScope rcpp_rngScope_gen; 128 | Rcpp::traits::input_parameter< std::vector >::type args(argsSEXP); 129 | Rcpp::traits::input_parameter< std::string >::type pth(pthSEXP); 130 | Rcpp::traits::input_parameter< int >::type MilliSecs(MilliSecsSEXP); 131 | Rcpp::traits::input_parameter< std::string >::type pth_in(pth_inSEXP); 132 | Rcpp::traits::input_parameter< std::string >::type queryWord(queryWordSEXP); 133 | Rcpp::traits::input_parameter< bool >::type remove_previous_file(remove_previous_fileSEXP); 134 | give_args_fasttext(args, pth, MilliSecs, pth_in, queryWord, remove_previous_file); 135 | return R_NilValue; 136 | END_RCPP 137 | } 138 | -------------------------------------------------------------------------------- /.github/workflows/tic.yml: -------------------------------------------------------------------------------- 1 | ## tic GitHub Actions template: linux-macos-windows-deploy 2 | ## revision date: 2020-12-11 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | # for now, CRON jobs only run on the default branch of the repo (i.e. 
usually on master) 8 | schedule: 9 | # * is a special character in YAML so you have to quote this string 10 | - cron: "0 4 * * *" 11 | 12 | name: tic 13 | 14 | jobs: 15 | all: 16 | runs-on: ${{ matrix.config.os }} 17 | 18 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | config: 24 | # use a different tic template type if you do not want to build on all listed platforms 25 | - { os: windows-latest, r: "release" } 26 | - { os: macOS-latest, r: "release", pkgdown: "true", latex: "true" } 27 | - { os: ubuntu-latest, r: "devel" } 28 | - { os: ubuntu-latest, r: "release" } 29 | 30 | env: 31 | # otherwise remotes::fun() errors cause the build to fail. Example: Unavailability of binaries 32 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 33 | CRAN: ${{ matrix.config.cran }} 34 | # make sure to run `tic::use_ghactions_deploy()` to set up deployment 35 | TIC_DEPLOY_KEY: ${{ secrets.TIC_DEPLOY_KEY }} 36 | # prevent rgl issues because no X11 display is available 37 | RGL_USE_NULL: true 38 | # if you use bookdown or blogdown, replace "PKGDOWN" by the respective 39 | # capitalized term. This also might need to be done in tic.R 40 | BUILD_PKGDOWN: ${{ matrix.config.pkgdown }} 41 | # macOS >= 10.15.4 linking 42 | SDKROOT: /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk 43 | # use GITHUB_TOKEN from GitHub to workaround rate limits in {remotes} 44 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 45 | 46 | steps: 47 | - uses: actions/checkout@v3 48 | 49 | - uses: r-lib/actions/setup-r@v2 50 | with: 51 | r-version: ${{ matrix.config.r }} 52 | Ncpus: 4 53 | 54 | # LaTeX. 
Installation time: 55 | # Linux: ~ 1 min 56 | # macOS: ~ 1 min 30s 57 | # Windows: never finishes 58 | - uses: r-lib/actions/setup-tinytex@v2 59 | if: matrix.config.latex == 'true' 60 | 61 | - uses: r-lib/actions/setup-pandoc@v2 62 | 63 | # set date/week for use in cache creation 64 | # https://github.community/t5/GitHub-Actions/How-to-set-and-access-a-Workflow-variable/m-p/42970 65 | # - cache R packages daily 66 | - name: "[Cache] Prepare daily timestamp for cache" 67 | if: runner.os != 'Windows' 68 | id: date 69 | run: echo "::set-output name=date::$(date '+%d-%m')" 70 | 71 | - name: "[Cache] Cache R packages" 72 | if: runner.os != 'Windows' 73 | uses: pat-s/always-upload-cache@v2.1.3 74 | with: 75 | path: ${{ env.R_LIBS_USER }} 76 | key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}} 77 | restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}} 78 | 79 | # for some strange Windows reason this step and the next one need to be decoupled 80 | - name: "[Stage] Prepare" 81 | run: | 82 | Rscript -e "if (!requireNamespace('remotes')) install.packages('remotes', type = 'source')" 83 | Rscript -e "if (getRversion() < '3.2' && !requireNamespace('curl')) install.packages('curl', type = 'source')" 84 | 85 | - name: "[Stage] [Linux] Install curl and libgit2" 86 | if: runner.os == 'Linux' 87 | run: sudo apt install libcurl4-openssl-dev libgit2-dev 88 | 89 | - name: "[Stage] [macOS] Install libgit2" 90 | if: runner.os == 'macOS' 91 | run: brew install libgit2 92 | 93 | - name: "[Stage] [macOS] Install system libs for pkgdown" 94 | if: runner.os == 'macOS' && matrix.config.pkgdown != '' 95 | run: brew install harfbuzz fribidi 96 | 97 | - name: "[Stage] [Linux] Install system libs for pkgdown" 98 | if: runner.os == 'Linux' && matrix.config.pkgdown != '' 99 | run: sudo apt install libharfbuzz-dev libfribidi-dev 100 | 101 | - name: "[Stage] Install" 102 | if: matrix.config.os != 'macOS-latest' || matrix.config.r != 'devel' 103 | 
run: Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()" 104 | 105 | # macOS devel needs its own stage because we need to work with an option to suppress the usage of binaries 106 | - name: "[Stage] Prepare & Install (macOS-devel)" 107 | if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'devel' 108 | run: | 109 | echo -e 'options(Ncpus = 4, pkgType = "source", repos = structure(c(CRAN = "https://cloud.r-project.org/")))' > $HOME/.Rprofile 110 | Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()" 111 | 112 | - name: "[Stage] Script" 113 | run: Rscript -e 'tic::script()' 114 | 115 | - name: "[Stage] After Success" 116 | if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'release' 117 | run: Rscript -e "tic::after_success()" 118 | 119 | - name: "[Stage] Upload R CMD check artifacts" 120 | if: failure() 121 | uses: actions/upload-artifact@v2.2.1 122 | with: 123 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 124 | path: check 125 | - name: "[Stage] Before Deploy" 126 | run: | 127 | Rscript -e "tic::before_deploy()" 128 | 129 | - name: "[Stage] Deploy" 130 | run: Rscript -e "tic::deploy()" 131 | 132 | - name: "[Stage] After Deploy" 133 | run: Rscript -e "tic::after_deploy()" 134 | -------------------------------------------------------------------------------- /inst/include/fasttext.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #pragma once 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "args.h" 22 | #include "dictionary.h" 23 | #include "matrix.h" 24 | #include "meter.h" 25 | #include "model.h" 26 | #include "qmatrix.h" 27 | #include "real.h" 28 | #include "utils.h" 29 | #include "vector.h" 30 | 31 | 32 | namespace fasttext { 33 | 34 | class FastText { 35 | protected: 36 | std::shared_ptr args_; 37 | std::shared_ptr dict_; 38 | 39 | std::shared_ptr input_; 40 | std::shared_ptr output_; 41 | 42 | std::shared_ptr qinput_; 43 | std::shared_ptr qoutput_; 44 | 45 | std::shared_ptr model_; 46 | 47 | std::atomic tokenCount_{}; 48 | std::atomic loss_{}; 49 | 50 | std::chrono::steady_clock::time_point start_; 51 | void signModel(std::ostream&); 52 | bool checkModel(std::istream&); 53 | void startThreads(std::string, int); 54 | void addInputVector(Vector&, int32_t) const; 55 | void trainThread(int32_t); 56 | std::vector> getNN( 57 | const Matrix& wordVectors, 58 | const Vector& queryVec, 59 | int32_t k, 60 | const std::set& banSet); 61 | void lazyComputeWordVectors(); 62 | void printInfo(real, real, std::ostream&, std::ofstream&, std::string); 63 | 64 | bool quant_; 65 | int32_t version; 66 | std::unique_ptr wordVectors_; 67 | 68 | public: 69 | FastText(); 70 | 71 | int32_t getWordId(const std::string& word) const; 72 | 73 | int32_t getSubwordId(const std::string& subword) const; 74 | 75 | void getWordVector(Vector& vec, const std::string& word) const; 76 | 77 | void getSubwordVector(Vector& vec, const std::string& subword) const; 78 | 79 | inline void getInputVector(Vector& vec, int32_t ind) { 80 | vec.zero(); 81 | addInputVector(vec, ind); 82 | } 83 | 84 | const Args getArgs() const; 85 | 86 | std::shared_ptr getDictionary() const; 87 | 88 | std::shared_ptr getInputMatrix() const; 89 | 90 | std::shared_ptr getOutputMatrix() const; 91 | 92 | void saveVectors(const std::string& filename); 
93 | 94 | void saveModel(const std::string& filename); 95 | 96 | void saveOutput(const std::string& filename); 97 | 98 | void loadModel(std::istream& in); 99 | 100 | void loadModel(const std::string& filename); 101 | 102 | void getSentenceVector(std::istream& in, Vector& vec); 103 | 104 | void quantize(const Args& qargs); 105 | 106 | std::tuple 107 | test(std::istream& in, int32_t k, real threshold = 0.0); 108 | 109 | void test(std::istream& in, int32_t k, real threshold, Meter& meter) const; 110 | 111 | void predict( 112 | int32_t k, 113 | const std::vector& words, 114 | std::vector>& predictions, 115 | real threshold = 0.0) const; 116 | 117 | bool predictLine( 118 | std::istream& in, 119 | std::vector>& predictions, 120 | int32_t k, 121 | real threshold) const; 122 | 123 | std::vector> getNgramVectors( 124 | const std::string& word) const; 125 | 126 | std::vector> getNN( 127 | const std::string& word, 128 | int32_t k); 129 | 130 | std::vector> getAnalogies( 131 | int32_t k, 132 | const std::string& wordA, 133 | const std::string& wordB, 134 | const std::string& wordC); 135 | 136 | void train(const Args& args, std::string pth, int MilliSecs); 137 | 138 | void loadVectors(const std::string& filename); 139 | 140 | int getDimension() const; 141 | 142 | bool isQuant() const; 143 | 144 | // FASTTEXT_DEPRECATED( 145 | // "getVector is being deprecated and replaced by getWordVector.") 146 | void getVector(Vector& vec, const std::string& word) const; 147 | 148 | // FASTTEXT_DEPRECATED( 149 | // "ngramVectors is being deprecated and replaced by getNgramVectors.") 150 | void ngramVectors(std::string word); 151 | 152 | // // FASTTEXT_DEPRECATED( 153 | // // "analogies is being deprecated and replaced by getAnalogies.") 154 | // void analogies(int32_t k); 155 | 156 | // FASTTEXT_DEPRECATED("supervised is being deprecated.") 157 | void supervised( 158 | Model& model, 159 | real lr, 160 | const std::vector& line, 161 | const std::vector& labels); 162 | 163 | // 
FASTTEXT_DEPRECATED("cbow is being deprecated.") 164 | void cbow(Model& model, real lr, const std::vector& line); 165 | 166 | // FASTTEXT_DEPRECATED("skipgram is being deprecated.") 167 | void skipgram(Model& model, real lr, const std::vector& line); 168 | 169 | // FASTTEXT_DEPRECATED("selectEmbeddings is being deprecated.") 170 | std::vector selectEmbeddings(int32_t cutoff) const; 171 | 172 | // FASTTEXT_DEPRECATED( 173 | // "saveVectors is being deprecated, please use the other signature.") 174 | void saveVectors(); 175 | 176 | // FASTTEXT_DEPRECATED( 177 | // "saveOutput is being deprecated, please use the other signature.") 178 | void saveOutput(); 179 | 180 | // FASTTEXT_DEPRECATED( 181 | // "saveModel is being deprecated, please use the other signature.") 182 | void saveModel(); 183 | 184 | // FASTTEXT_DEPRECATED("precomputeWordVectors is being deprecated.") 185 | void precomputeWordVectors(Matrix& wordVectors); 186 | 187 | // FASTTEXT_DEPRECATED("findNN is being deprecated and replaced by getNN.") 188 | void findNN( 189 | const Matrix& wordVectors, 190 | const Vector& query, 191 | int32_t k, 192 | const std::set& banSet, 193 | std::vector>& results); 194 | }; 195 | } // namespace fasttext 196 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' Print Usage Information for all parameters 5 | #' 6 | #' @param verbose if TRUE then information will be printed in the console 7 | #' 8 | #' @return It does not return a value but only prints the available parameters of the 'printUsage' function in the R session 9 | #' @export 10 | #' @examples 11 | #' 12 | #' library(fastText) 13 | #' 14 | #' printUsage() 15 | #' 16 | printUsage <- function(verbose = TRUE) { 17 | 
invisible(.Call(`_fastText_printUsage`, verbose)) 18 | } 19 | 20 | #' Print Usage Information when the command equals to 'quantize' 21 | #' 22 | #' @param verbose if TRUE then information will be printed in the console 23 | #' 24 | #' @return It does not return a value but only prints the available parameters of the 'printQuantizeUsage' function in the R session 25 | #' @export 26 | #' @examples 27 | #' 28 | #' library(fastText) 29 | #' 30 | #' printQuantizeUsage() 31 | #' 32 | printQuantizeUsage <- function(verbose = TRUE) { 33 | invisible(.Call(`_fastText_printQuantizeUsage`, verbose)) 34 | } 35 | 36 | #' Print Usage Information when the command equals to 'test' 37 | #' 38 | #' @param verbose if TRUE then information will be printed in the console 39 | #' 40 | #' @return It does not return a value but only prints the available parameters of the 'printTestUsage' function in the R session 41 | #' @export 42 | #' @examples 43 | #' 44 | #' library(fastText) 45 | #' 46 | #' printTestUsage() 47 | #' 48 | printTestUsage <- function(verbose = TRUE) { 49 | invisible(.Call(`_fastText_printTestUsage`, verbose)) 50 | } 51 | 52 | #' Print Usage Information when the command equals to 'predict' or 'predict-prob' 53 | #' 54 | #' @param verbose if TRUE then information will be printed in the console 55 | #' 56 | #' @return It does not return a value but only prints the available parameters of the 'printPredictUsage' function in the R session 57 | #' @export 58 | #' @examples 59 | #' 60 | #' library(fastText) 61 | #' 62 | #' printPredictUsage() 63 | #' 64 | printPredictUsage <- function(verbose = TRUE) { 65 | invisible(.Call(`_fastText_printPredictUsage`, verbose)) 66 | } 67 | 68 | #' Print Usage Information when the command equals to 'test-label' 69 | #' 70 | #' @param verbose if TRUE then information will be printed in the console 71 | #' 72 | #' @return It does not return a value but only prints the available parameters of the 'printTestLabelUsage' function in the R session 73 
| #' @export 74 | #' @examples 75 | #' 76 | #' library(fastText) 77 | #' 78 | #' printTestLabelUsage() 79 | #' 80 | printTestLabelUsage <- function(verbose = TRUE) { 81 | invisible(.Call(`_fastText_printTestLabelUsage`, verbose)) 82 | } 83 | 84 | #' Print Usage Information when the command equals to 'print-word-vectors' 85 | #' 86 | #' @param verbose if TRUE then information will be printed in the console 87 | #' 88 | #' @return It does not return a value but only prints the available parameters of the 'printPrintWordVectorsUsage' function in the R session 89 | #' @export 90 | #' @examples 91 | #' 92 | #' library(fastText) 93 | #' 94 | #' printPrintWordVectorsUsage() 95 | #' 96 | printPrintWordVectorsUsage <- function(verbose = TRUE) { 97 | invisible(.Call(`_fastText_printPrintWordVectorsUsage`, verbose)) 98 | } 99 | 100 | #' Print Usage Information when the command equals to 'print-sentence-vectors' 101 | #' 102 | #' @param verbose if TRUE then information will be printed in the console 103 | #' 104 | #' @return It does not return a value but only prints the available parameters of the 'printPrintSentenceVectorsUsage' function in the R session 105 | #' @export 106 | #' @examples 107 | #' 108 | #' library(fastText) 109 | #' 110 | #' printPrintSentenceVectorsUsage() 111 | #' 112 | printPrintSentenceVectorsUsage <- function(verbose = TRUE) { 113 | invisible(.Call(`_fastText_printPrintSentenceVectorsUsage`, verbose)) 114 | } 115 | 116 | #' Print Usage Information when the command equals to 'print-ngrams' 117 | #' 118 | #' @param verbose if TRUE then information will be printed in the console 119 | #' 120 | #' @return It does not return a value but only prints the available parameters of the 'printPrintNgramsUsage' function in the R session 121 | #' @export 122 | #' @examples 123 | #' 124 | #' library(fastText) 125 | #' 126 | #' printPrintNgramsUsage() 127 | #' 128 | printPrintNgramsUsage <- function(verbose = TRUE) { 129 | 
invisible(.Call(`_fastText_printPrintNgramsUsage`, verbose)) 130 | } 131 | 132 | #' Print Usage Information when the command equals to 'nn' 133 | #' 134 | #' @param verbose if TRUE then information will be printed in the console 135 | #' 136 | #' @return It does not return a value but only prints the available parameters of the 'printNNUsage' function in the R session 137 | #' @export 138 | #' @examples 139 | #' 140 | #' library(fastText) 141 | #' 142 | #' printNNUsage() 143 | #' 144 | printNNUsage <- function(verbose = TRUE) { 145 | invisible(.Call(`_fastText_printNNUsage`, verbose)) 146 | } 147 | 148 | #' Print Usage Information when the command equals to 'analogies' 149 | #' 150 | #' @param verbose if TRUE then information will be printed in the console 151 | #' 152 | #' @return It does not return a value but only prints the available parameters of the 'printAnalogiesUsage' function in the R session 153 | #' @export 154 | #' @examples 155 | #' 156 | #' library(fastText) 157 | #' 158 | #' printAnalogiesUsage() 159 | #' 160 | printAnalogiesUsage <- function(verbose = TRUE) { 161 | invisible(.Call(`_fastText_printAnalogiesUsage`, verbose)) 162 | } 163 | 164 | #' Print Usage Information when the command equals to 'dump' 165 | #' 166 | #' @param verbose if TRUE then information will be printed in the console 167 | #' 168 | #' @return It does not return a value but only prints the available parameters of the 'printDumpUsage' function in the R session 169 | #' @export 170 | #' @examples 171 | #' 172 | #' library(fastText) 173 | #' 174 | #' printDumpUsage() 175 | #' 176 | printDumpUsage <- function(verbose = TRUE) { 177 | invisible(.Call(`_fastText_printDumpUsage`, verbose)) 178 | } 179 | 180 | #' The Rcpp function which is used in the 'fasttext_interface' R function 181 | #' 182 | #' @param args the arguments that will be passed to the function in form of a character vector 183 | #' @param pth a character string specifying the path where the process-logs (or output in 
generally) should be saved 184 | #' @param MilliSecs an integer specifying the delay in milliseconds when printing the results to the specified path_output 185 | #' @param pth_in a character string specifying the path to the input data file 186 | #' @param queryWord either an empty string or the queryword that should be passed to the function 187 | #' @param remove_previous_file a boolean. If TRUE, in case that the path_output is not an empty string (""), then an existing file with the same output name will be removed 188 | #' @return It does not return a value but only saves the results to a file 189 | #' 190 | #' @keywords internal 191 | #' 192 | give_args_fasttext <- function(args, pth = "", MilliSecs = 100L, pth_in = "", queryWord = "", remove_previous_file = TRUE) { 193 | invisible(.Call(`_fastText_give_args_fasttext`, args, pth, MilliSecs, pth_in, queryWord, remove_previous_file)) 194 | } 195 | 196 | -------------------------------------------------------------------------------- /src/productquantizer.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "productquantizer.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace fasttext { 18 | 19 | real distL2(const real* x, const real* y, int32_t d) { 20 | real dist = 0; 21 | for (auto i = 0; i < d; i++) { 22 | auto tmp = x[i] - y[i]; 23 | dist += tmp * tmp; 24 | } 25 | return dist; 26 | } 27 | 28 | ProductQuantizer::ProductQuantizer(int32_t dim, int32_t dsub) 29 | : dim_(dim), 30 | nsubq_(dim / dsub), 31 | dsub_(dsub), 32 | centroids_(dim * ksub_), 33 | rng(seed_) { 34 | lastdsub_ = dim_ % dsub; 35 | if (lastdsub_ == 0) { 36 | lastdsub_ = dsub_; 37 | } else { 38 | nsubq_++; 39 | } 40 | } 41 | 42 | const real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) const { 43 | if (m == nsubq_ - 1) { 44 | return ¢roids_[m * ksub_ * dsub_ + i * lastdsub_]; 45 | } 46 | return ¢roids_[(m * ksub_ + i) * dsub_]; 47 | } 48 | 49 | real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) { 50 | if (m == nsubq_ - 1) { 51 | return ¢roids_[m * ksub_ * dsub_ + i * lastdsub_]; 52 | } 53 | return ¢roids_[(m * ksub_ + i) * dsub_]; 54 | } 55 | 56 | real ProductQuantizer::assign_centroid( 57 | const real* x, 58 | const real* c0, 59 | uint8_t* code, 60 | int32_t d) const { 61 | const real* c = c0; 62 | real dis = distL2(x, c, d); 63 | code[0] = 0; 64 | for (auto j = 1; j < ksub_; j++) { 65 | c += d; 66 | real disij = distL2(x, c, d); 67 | if (disij < dis) { 68 | code[0] = (uint8_t)j; 69 | dis = disij; 70 | } 71 | } 72 | return dis; 73 | } 74 | 75 | void ProductQuantizer::Estep( 76 | const real* x, 77 | const real* centroids, 78 | uint8_t* codes, 79 | int32_t d, 80 | int32_t n) const { 81 | for (auto i = 0; i < n; i++) { 82 | assign_centroid(x + i * d, centroids, codes + i, d); 83 | } 84 | } 85 | 86 | void ProductQuantizer::MStep( 87 | const real* x0, 88 | real* centroids, 89 | const uint8_t* codes, 90 | int32_t d, 91 | int32_t n) { 92 | std::vector nelts(ksub_, 0); 93 | memset(centroids, 0, sizeof(real) * d * ksub_); 
94 | const real* x = x0; 95 | for (auto i = 0; i < n; i++) { 96 | auto k = codes[i]; 97 | real* c = centroids + k * d; 98 | for (auto j = 0; j < d; j++) { 99 | c[j] += x[j]; 100 | } 101 | nelts[k]++; 102 | x += d; 103 | } 104 | 105 | real* c = centroids; 106 | for (auto k = 0; k < ksub_; k++) { 107 | real z = (real)nelts[k]; 108 | if (z != 0) { 109 | for (auto j = 0; j < d; j++) { 110 | c[j] /= z; 111 | } 112 | } 113 | c += d; 114 | } 115 | 116 | std::uniform_real_distribution<> runiform(0, 1); 117 | for (auto k = 0; k < ksub_; k++) { 118 | if (nelts[k] == 0) { 119 | int32_t m = 0; 120 | while (runiform(rng) * (n - ksub_) >= nelts[m] - 1) { 121 | m = (m + 1) % ksub_; 122 | } 123 | memcpy(centroids + k * d, centroids + m * d, sizeof(real) * d); 124 | for (auto j = 0; j < d; j++) { 125 | int32_t sign = (j % 2) * 2 - 1; 126 | centroids[k * d + j] += sign * eps_; 127 | centroids[m * d + j] -= sign * eps_; 128 | } 129 | nelts[k] = nelts[m] / 2; 130 | nelts[m] -= nelts[k]; 131 | } 132 | } 133 | } 134 | 135 | void ProductQuantizer::kmeans(const real* x, real* c, int32_t n, int32_t d) { 136 | std::vector<int32_t> perm(n, 0); 137 | std::iota(perm.begin(), perm.end(), 0); 138 | std::shuffle(perm.begin(), perm.end(), rng); 139 | for (auto i = 0; i < ksub_; i++) { 140 | memcpy(&c[i * d], x + perm[i] * d, d * sizeof(real)); 141 | } 142 | auto codes = std::vector<uint8_t>(n); 143 | for (auto i = 0; i < niter_; i++) { 144 | Estep(x, c, codes.data(), d, n); 145 | MStep(x, c, codes.data(), d, n); 146 | } 147 | } 148 | 149 | void ProductQuantizer::train(int32_t n, const real* x) { 150 | if (n < ksub_) { 151 | throw std::invalid_argument( 152 | "Matrix too small for quantization, must have at least " + 153 | std::to_string(ksub_) + " rows"); 154 | } 155 | std::vector<int32_t> perm(n, 0); 156 | std::iota(perm.begin(), perm.end(), 0); 157 | auto d = dsub_; 158 | auto np = std::min(n, max_points_); 159 | auto xslice = std::vector<real>(np * dsub_); 160 | for (auto m = 0; m < nsubq_; m++) { 161 | if (m == nsubq_ - 1) {
162 | d = lastdsub_; 163 | } 164 | if (np != n) { 165 | std::shuffle(perm.begin(), perm.end(), rng); 166 | } 167 | for (auto j = 0; j < np; j++) { 168 | memcpy( 169 | xslice.data() + j * d, 170 | x + perm[j] * dim_ + m * dsub_, 171 | d * sizeof(real)); 172 | } 173 | kmeans(xslice.data(), get_centroids(m, 0), np, d); 174 | } 175 | } 176 | 177 | real ProductQuantizer::mulcode( 178 | const Vector& x, 179 | const uint8_t* codes, 180 | int32_t t, 181 | real alpha) const { 182 | real res = 0.0; 183 | auto d = dsub_; 184 | const uint8_t* code = codes + nsubq_ * t; 185 | for (auto m = 0; m < nsubq_; m++) { 186 | const real* c = get_centroids(m, code[m]); 187 | if (m == nsubq_ - 1) { 188 | d = lastdsub_; 189 | } 190 | for (auto n = 0; n < d; n++) { 191 | res += x[m * dsub_ + n] * c[n]; 192 | } 193 | } 194 | return res * alpha; 195 | } 196 | 197 | void ProductQuantizer::addcode( 198 | Vector& x, 199 | const uint8_t* codes, 200 | int32_t t, 201 | real alpha) const { 202 | auto d = dsub_; 203 | const uint8_t* code = codes + nsubq_ * t; 204 | for (auto m = 0; m < nsubq_; m++) { 205 | const real* c = get_centroids(m, code[m]); 206 | if (m == nsubq_ - 1) { 207 | d = lastdsub_; 208 | } 209 | for (auto n = 0; n < d; n++) { 210 | x[m * dsub_ + n] += alpha * c[n]; 211 | } 212 | } 213 | } 214 | 215 | void ProductQuantizer::compute_code(const real* x, uint8_t* code) const { 216 | auto d = dsub_; 217 | for (auto m = 0; m < nsubq_; m++) { 218 | if (m == nsubq_ - 1) { 219 | d = lastdsub_; 220 | } 221 | assign_centroid(x + m * dsub_, get_centroids(m, 0), code + m, d); 222 | } 223 | } 224 | 225 | void ProductQuantizer::compute_codes(const real* x, uint8_t* codes, int32_t n) 226 | const { 227 | for (auto i = 0; i < n; i++) { 228 | compute_code(x + i * dim_, codes + i * nsubq_); 229 | } 230 | } 231 | 232 | void ProductQuantizer::save(std::ostream& out) { 233 | out.write((char*)&dim_, sizeof(dim_)); 234 | out.write((char*)&nsubq_, sizeof(nsubq_)); 235 | out.write((char*)&dsub_, sizeof(dsub_)); 
236 | out.write((char*)&lastdsub_, sizeof(lastdsub_)); 237 | out.write((char*)centroids_.data(), centroids_.size() * sizeof(real)); 238 | } 239 | 240 | void ProductQuantizer::load(std::istream& in) { 241 | in.read((char*)&dim_, sizeof(dim_)); 242 | in.read((char*)&nsubq_, sizeof(nsubq_)); 243 | in.read((char*)&dsub_, sizeof(dsub_)); 244 | in.read((char*)&lastdsub_, sizeof(lastdsub_)); 245 | centroids_.resize(dim_ * ksub_); 246 | for (auto i = 0; i < centroids_.size(); i++) { 247 | in.read((char*)&centroids_[i], sizeof(real)); 248 | } 249 | } 250 | 251 | } // namespace fasttext 252 | -------------------------------------------------------------------------------- /man/fasttext_interface.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fasttext_interface.R 3 | \name{fasttext_interface} 4 | \alias{fasttext_interface} 5 | \title{Interface for the fasttext library} 6 | \usage{ 7 | fasttext_interface( 8 | list_params, 9 | path_output = "", 10 | MilliSecs = 100, 11 | path_input = "", 12 | remove_previous_file = TRUE, 13 | print_process_time = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{list_params}{a list of valid parameters} 18 | 19 | \item{path_output}{a character string specifying the file path where the process-logs (or output in general) should be saved} 20 | 21 | \item{MilliSecs}{an integer specifying the delay in milliseconds when printing the results to the specified \emph{path_output}} 22 | 23 | \item{path_input}{a character string specifying the path to the input data file} 24 | 25 | \item{remove_previous_file}{a boolean. If TRUE, in case that the \emph{path_output} is not an empty string (""), then an existing file with the same output name will be removed} 26 | 27 | \item{print_process_time}{a boolean.
If TRUE then the processing time of the function will be printed out in the R session} 28 | } 29 | \value{ 30 | a vector of class character that includes the parameters and file paths used as input to the function 31 | } 32 | \description{ 33 | Interface for the fasttext library 34 | } 35 | \details{ 36 | This function allows the user to run the various methods included in the fasttext library from within R 37 | 38 | The "output" parameter which exists in the named list (see examples section) and is passed to the "list_params" parameter of the "fasttext_interface()" function, is a file path and not a directory name and will actually return two files (a *.vec* and a *.bin*) to the output directory. 39 | } 40 | \examples{ 41 | 42 | \dontrun{ 43 | 44 | library(fastText) 45 | 46 | 47 | #################################################################################### 48 | # If the user intends to run the following examples then he / she must replace # 49 | # the 'input', 'output', 'path_input', 'path_output', 'model' and 'test_data' file # 50 | # paths depending on where the data are located or should be saved! 
# 51 | # ( 'tempdir()' is used here as an example folder ) # 52 | #################################################################################### 53 | 54 | 55 | # ------------------------------------------------ 56 | # print information for the Usage of each function [ parameters ] 57 | # ------------------------------------------------ 58 | 59 | fastText::printUsage() 60 | fastText::printTestUsage() 61 | fastText::printTestLabelUsage() 62 | fastText::printQuantizeUsage() 63 | fastText::printPrintWordVectorsUsage() 64 | fastText::printPrintSentenceVectorsUsage() 65 | fastText::printPrintNgramsUsage() 66 | fastText::printPredictUsage() 67 | fastText::printNNUsage() 68 | fastText::printDumpUsage() 69 | fastText::printAnalogiesUsage() 70 | fastText::print_parameters(command = "supervised") 71 | 72 | # ----------------------------------------------------------------------- 73 | # In case that the 'command' is one of 'cbow', 'skipgram' or 'supervised' 74 | # ----------------------------------------------------------------------- 75 | 76 | list_params = list(command = 'cbow', 77 | lr = 0.1, 78 | dim = 200, 79 | input = file.path(tempdir(), "doc.txt"), 80 | output = tempdir(), 81 | verbose = 2, 82 | thread = 1) 83 | 84 | res = fasttext_interface(list_params, 85 | path_output = file.path(tempdir(),"model_logs.txt"), 86 | MilliSecs = 100) 87 | 88 | 89 | # --------------------- 90 | # 'supervised' training 91 | # --------------------- 92 | 93 | list_params = list(command = 'supervised', 94 | lr = 0.1, 95 | dim = 200, 96 | input = file.path(tempdir(), "cooking.train"), 97 | output = file.path(tempdir(), "model_cooking"), 98 | verbose = 2, 99 | thread = 1) 100 | 101 | res = fasttext_interface(list_params, 102 | path_output = file.path(tempdir(), 'logs_supervise.txt'), 103 | MilliSecs = 5) 104 | 105 | # --------------------------------------- 106 | # In case that the 'command' is 'predict' 107 | # --------------------------------------- 108 | 109 | list_params = 
list(command = 'predict', 110 | model = file.path(tempdir(), 'model_cooking.bin'), 111 | test_data = file.path(tempdir(), 'cooking.valid'), 112 | k = 1, 113 | th = 0.0) 114 | 115 | res = fasttext_interface(list_params, 116 | path_output = file.path(tempdir(), 'predict_valid.txt')) 117 | 118 | 119 | # ------------------------------------ 120 | # In case that the 'command' is 'test' [ k = 5 , means that precision and recall are at 5 ] 121 | # ------------------------------------ 122 | 123 | list_params = list(command = 'test', 124 | model = file.path(tempdir(), 'model_cooking.bin'), 125 | test_data = file.path(tempdir(), 'cooking.valid'), 126 | k = 5, 127 | th = 0.0) 128 | 129 | res = fasttext_interface(list_params) # It only prints 'Precision', 'Recall' to the R session 130 | 131 | 132 | # ------------------------------------------ 133 | # In case that the 'command' is 'test-label' [ k = 5 , means that precision and recall are at 5 ] 134 | # ------------------------------------------ 135 | 136 | list_params = list(command = 'test-label', 137 | model = file.path(tempdir(), 'model_cooking.bin'), 138 | test_data = file.path(tempdir(), 'cooking.valid'), 139 | k = 5, 140 | th = 0.0) 141 | 142 | res = fasttext_interface(list_params, # prints also 'Precision', 'Recall' to R session 143 | path_output = file.path(tempdir(), "test_valid.txt")) 144 | 145 | # ----------------- 146 | # quantize function [ it will take a .bin file and return an .ftz file ] 147 | # ----------------- 148 | 149 | # the quantize function is currently (01/02/2019) single-threaded 150 | # https://github.com/facebookresearch/fastText/issues/353#issuecomment-342501742 151 | 152 | list_params = list(command = 'quantize', 153 | input = file.path(tempdir(), 'model_cooking.bin'), 154 | output = file.path(tempdir(), gsub('.bin', '.ftz', 'model_cooking.bin'))) 155 | 156 | res = fasttext_interface(list_params) 157 | 158 | 159 | # ----------------- 160 | # quantize function [ by using the optional parameters
'qnorm' and 'qout' ] 161 | # ----------------- 162 | 163 | list_params = list(command = 'quantize', 164 | input = file.path(tempdir(), 'model_cooking.bin'), 165 | output = file.path(tempdir(), gsub('.bin', '.ftz', 'model_cooking.bin')), 166 | qnorm = TRUE, 167 | qout = TRUE) 168 | 169 | res = fasttext_interface(list_params) 170 | 171 | 172 | # ------------------ 173 | # print-word-vectors [ each line of the 'queries.txt' must be a single word ] 174 | # ------------------ 175 | 176 | list_params = list(command = 'print-word-vectors', 177 | model = file.path(tempdir(), 'model_cooking.bin')) 178 | 179 | res = fasttext_interface(list_params, 180 | path_input = file.path(tempdir(), 'queries.txt'), 181 | path_output = file.path(tempdir(), 'print_vecs_file.txt')) 182 | 183 | 184 | # ---------------------- 185 | # print-sentence-vectors [ See also the comments in the main.cc file about the input-file ] 186 | # ---------------------- 187 | 188 | list_params = list(command = 'print-sentence-vectors', 189 | model = file.path(tempdir(), 'model_cooking.bin')) 190 | 191 | res = fasttext_interface(list_params, 192 | path_input = file.path(tempdir(), 'text.txt'), 193 | path_output = file.path(tempdir(), 'SENTENCE_VECs.txt')) 194 | 195 | 196 | # ------------ 197 | # print-ngrams [ print to console or to output-file ] 198 | # ------------ 199 | 200 | list_params = list(command = 'skipgram', lr = 0.1, dim = 200, 201 | input = file.path(tempdir(), "doc.txt"), 202 | output = tempdir(), verbose = 2, thread = 1, 203 | minn = 2, maxn = 2) 204 | 205 | res = fasttext_interface(list_params, 206 | path_output = file.path(tempdir(), "ngram_out.txt"), 207 | MilliSecs = 5) 208 | 209 | list_params = list(command = 'print-ngrams', 210 | model = file.path(tempdir(), 'ngram_out.bin'), 211 | word = 'word') # print n-grams for specific word 212 | 213 | res = fasttext_interface(list_params, path_output = "") # print output to console 214 | res = fasttext_interface(list_params, 215 | path_output = 
file.path(tempdir(), "NGRAMS.txt")) # output to file 216 | 217 | 218 | # ------------- 219 | # 'nn' function 220 | # ------------- 221 | 222 | list_params = list(command = 'nn', 223 | model = file.path(tempdir(), 'model_cooking.bin'), 224 | k = 20, 225 | query_word = 'word') # a 'query_word' is required 226 | 227 | res = fasttext_interface(list_params, 228 | path_output = file.path(tempdir(), "nn_output.txt")) 229 | 230 | 231 | # --------- 232 | # analogies [ in the output file each analogy-triplet-result is separated with a newline ] 233 | # --------- 234 | 235 | list_params = list(command = 'analogies', 236 | model = file.path(tempdir(), 'model_cooking.bin'), 237 | k = 5) 238 | 239 | res = fasttext_interface(list_params, 240 | path_input = file.path(tempdir(), 'analogy_queries.txt'), 241 | path_output = file.path(tempdir(), 'analogies_output.txt')) 242 | 243 | # ------------- 244 | # dump function [ the 'option' param should be one of 'args', 'dict', 'input' or 'output' ] 245 | # ------------- 246 | 247 | list_params = list(command = 'dump', 248 | model = file.path(tempdir(), 'model_cooking.bin'), 249 | option = 'args') 250 | 251 | res = fasttext_interface(list_params, 252 | path_output = file.path(tempdir(), "DUMP.txt")) 253 | 254 | } 255 | } 256 | \references{ 257 | https://github.com/facebookresearch/fastText 258 | 259 | https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md 260 | } 261 | -------------------------------------------------------------------------------- /tests/testthat/declaration_human_rights_english.txt: -------------------------------------------------------------------------------- 1 | Universal Declaration of Human Rights 2 | Preamble 3 | Whereas recognition of the inherent dignity and of the equal and inalienable 4 | rights of all members of the human family is the foundation of freedom, justice 5 | and peace in the world, 6 | Whereas disregard and contempt for human rights have resulted in barbarous 7 | acts 
which have outraged the conscience of mankind, and the advent of a world 8 | in which human beings shall enjoy freedom of speech and belief and freedom 9 | from fear and want has been proclaimed as the highest aspiration of the common 10 | people, 11 | Whereas it is essential, if man is not to be compelled to have recourse, as a last 12 | resort, to rebellion against tyranny and oppression, that human rights should be 13 | protected by the rule of law, 14 | Whereas it is essential to promote the development of friendly relations between 15 | nations, 16 | Whereas the peoples of the United Nations have in the Charter reaffirmed their 17 | faith in fundamental human rights, in the dignity and worth of the human person 18 | and in the equal rights of men and women and have determined to promote 19 | social progress and better standards of life in larger freedom, 20 | Whereas Member States have pledged themselves to achieve, in cooperation 21 | with the United Nations, the promotion of universal respect for and observance of 22 | human rights and fundamental freedoms, 23 | Whereas a common understanding of these rights and freedoms is of the 24 | greatest importance for the full realization of this pledge, 25 | Now, therefore, 26 | The General Assembly, 27 | Proclaims this Universal Declaration of Human Rights as a common standard of 28 | achievement for all peoples and all nations, to the end that every individual and 29 | every organ of society, keeping this Declaration constantly in mind, shall strive by 30 | 31 | teaching and education to promote respect for these rights and freedoms and by 32 | progressive measures, national and international, to secure their universal and 33 | effective recognition and observance, both among the peoples of Member States 34 | themselves and among the peoples of territories under their jurisdiction. 35 | Article I 36 | All human beings are born free and equal in dignity and rights. 
They are 37 | endowed with reason and conscience and should act towards one another in a 38 | spirit of brotherhood. 39 | Article 2 40 | Everyone is entitled to all the rights and freedoms set forth in this Declaration, 41 | without distinction of any kind, such as race, colour, sex, language, religion, 42 | political or other opinion, national or social origin, property, birth or other status. 43 | Furthermore, no distinction shall be made on the basis of the political, 44 | jurisdictional or international status of the country or territory to which a person 45 | belongs, whether it be independent, trust, non-self-governing or under any other 46 | limitation of sovereignty. 47 | Article 3 48 | Everyone has the right to life, liberty and the security of person. 49 | Article 4 50 | No one shall be held in slavery or servitude; slavery and the slave trade shall be 51 | prohibited in all their forms. 52 | Article 5 53 | No one shall be subjected to torture or to cruel, inhuman or degrading treatment 54 | or punishment. 55 | 56 | Article 6 57 | Everyone has the right to recognition everywhere as a person before the law. 58 | Article 7 59 | All are equal before the law and are entitled without any discrimination to equal 60 | protection of the law. All are entitled to equal protection against any 61 | discrimination in violation of this Declaration and against any incitement to such 62 | discrimination. 63 | Article 8 64 | Everyone has the right to an effective remedy by the competent national tribunals 65 | for acts violating the fundamental rights granted him by the constitution or by law. 66 | Article 9 67 | No one shall be subjected to arbitrary arrest, detention or exile. 68 | Article 10 69 | Everyone is entitled in full equality to a fair and public hearing by an independent 70 | and impartial tribunal, in the determination of his rights and obligations and of any 71 | criminal charge against him. 72 | Article 11 73 | 1. 
Everyone charged with a penal offence has the right to be presumed 74 | innocent until proved guilty according to law in a public trial at which he 75 | has had all the guarantees necessary for his defence. 76 | 2. No one shall be held guilty of any penal offence on account of any act or 77 | omission which did not constitute a penal offence, under national or 78 | international law, at the time when it was committed. Nor shall a heavier 79 | 80 | penalty be imposed than the one that was applicable at the time the penal 81 | offence was committed. 82 | Article 12 83 | No one shall be subjected to arbitrary interference with his privacy, family, home 84 | or correspondence, nor to attacks upon his honour and reputation. Everyone has 85 | the right to the protection of the law against such interference or attacks. 86 | Article 13 87 | 1. Everyone has the right to freedom of movement and residence within the 88 | borders of each State. 89 | 2. Everyone has the right to leave any country, including his own, and to 90 | return to his country. 91 | Article 14 92 | 1. Everyone has the right to seek and to enjoy in other countries asylum from 93 | persecution. 94 | 2. This right may not be invoked in the case of prosecutions genuinely 95 | arising from non-political crimes or from acts contrary to the purposes and 96 | principles of the United Nations. 97 | Article 15 98 | 1. Everyone has the right to a nationality. 99 | 2. No one shall be arbitrarily deprived of his nationality nor denied the right to 100 | change his nationality. 101 | Article 16 102 | 103 | 1. Men and women of full age, without any limitation due to race, nationality 104 | or religion, have the right to marry and to found a family. They are entitled 105 | to equal rights as to marriage, during marriage and at its dissolution. 106 | 2. Marriage shall be entered into only with the free and full consent of the 107 | intending spouses. 108 | 3. 
The family is the natural and fundamental group unit of society and is 109 | entitled to protection by society and the State. 110 | Article 17 111 | 1. Everyone has the right to own property alone as well as in association with 112 | others. 113 | 2. No one shall be arbitrarily deprived of his property. 114 | Article 18 115 | Everyone has the right to freedom of thought, conscience and religion; this right 116 | includes freedom to change his religion or belief, and freedom, either alone or in 117 | community with others and in public or private, to manifest his religion or belief in 118 | teaching, practice, worship and observance. 119 | Article 19 120 | Everyone has the right to freedom of opinion and expression; this right includes 121 | freedom to hold opinions without interference and to seek, receive and impart 122 | information and ideas through any media and regardless of frontiers. 123 | Article 20 124 | 1. Everyone has the right to freedom of peaceful assembly and association. 125 | 2. No one may be compelled to belong to an association. 126 | Article 21 127 | 128 | 1. Everyone has the right to take part in the government of his country, 129 | directly or through freely chosen representatives. 130 | 2. Everyone has the right to equal access to public service in his country. 131 | 3. The will of the people shall be the basis of the authority of government; 132 | this will shall be expressed in periodic and genuine elections which shall 133 | be by universal and equal suffrage and shall be held by secret vote or by 134 | equivalent free voting procedures. 135 | Article 22 136 | Everyone, as a member of society, has the right to social security and is entitled 137 | to realization, through national effort and international co-operation and in 138 | accordance with the organization and resources of each State, of the economic, 139 | social and cultural rights indispensable for his dignity and the free development 140 | of his personality. 
141 | Article 23 142 | 1. Everyone has the right to work, to free choice of employment, to just and 143 | favourable conditions of work and to protection against unemployment. 144 | 2. Everyone, without any discrimination, has the right to equal pay for equal 145 | work. 146 | 3. Everyone who works has the right to just and favourable remuneration 147 | ensuring for himself and his family an existence worthy of human dignity, 148 | and supplemented, if necessary, by other means of social protection. 149 | 4. Everyone has the right to form and to join trade unions for the protection of 150 | his interests. 151 | Article 24 152 | Everyone has the right to rest and leisure, including reasonable limitation of 153 | working hours and periodic holidays with pay. 154 | 155 | Article 25 156 | 1. Everyone has the right to a standard of living adequate for the health and 157 | well-being of himself and of his family, including food, clothing, housing 158 | and medical care and necessary social services, and the right to security 159 | in the event of unemployment, sickness, disability, widowhood, old age or 160 | other lack of livelihood in circumstances beyond his control. 161 | 2. Motherhood and childhood are entitled to special care and assistance. All 162 | children, whether born in or out of wedlock, shall enjoy the same social 163 | protection. 164 | Article 26 165 | 1. Everyone has the right to education. Education shall be free, at least in the 166 | elementary and fundamental stages. Elementary education shall be 167 | compulsory. Technical and professional education shall be made 168 | generally available and higher education shall be equally accessible to all 169 | on the basis of merit. 170 | 2. Education shall be directed to the full development of the human 171 | personality and to the strengthening of respect for human rights and 172 | fundamental freedoms. 
It shall promote understanding, tolerance and 173 | friendship among all nations, racial or religious groups, and shall further 174 | the activities of the United Nations for the maintenance of peace. 175 | 3. Parents have a prior right to choose the kind of education that shall be 176 | given to their children. 177 | Article 27 178 | 1. Everyone has the right freely to participate in the cultural life of the 179 | community, to enjoy the arts and to share in scientific advancement and 180 | its benefits. 181 | 182 | 2. Everyone has the right to the protection of the moral and material interests 183 | resulting from any scientific, literary or artistic production of which he is the 184 | author. 185 | Article 28 186 | Everyone is entitled to a social and international order in which the rights and 187 | freedoms set forth in this Declaration can be fully realized. 188 | Article 29 189 | 1. Everyone has duties to the community in which alone the free and full 190 | development of his personality is possible. 191 | 2. In the exercise of his rights and freedoms, everyone shall be subject only 192 | to such limitations as are determined by law solely for the purpose of 193 | securing due recognition and respect for the rights and freedoms of others 194 | and of meeting the just requirements of morality, public order and the 195 | general welfare in a democratic society. 196 | 3. These rights and freedoms may in no case be exercised contrary to the 197 | purposes and principles of the United Nations. 198 | Article 30 199 | Nothing in this Declaration may be interpreted as implying for any State, group or 200 | person any right to engage in any activity or to perform any act aimed at the 201 | destruction of any of the rights and freedoms set forth herein. 
202 | 203 | -------------------------------------------------------------------------------- /src/args.cc: -------------------------------------------------------------------------------- 1 | # include <Rcpp.h> 2 | // [[Rcpp::depends("Rcpp")]] 3 | // [[Rcpp::plugins(cpp11)]] 4 | 5 | /** 6 | * Copyright (c) 2016-present, Facebook, Inc. 7 | * All rights reserved. 8 | * 9 | * This source code is licensed under the MIT license found in the 10 | * LICENSE file in the root directory of this source tree. 11 | */ 12 | 13 | #include "args.h" 14 | 15 | #include <stdlib.h> 16 | 17 | #include <iostream> 18 | #include <stdexcept> 19 | 20 | namespace fasttext { 21 | 22 | Args::Args() { 23 | lr = 0.05; 24 | dim = 100; 25 | ws = 5; 26 | epoch = 5; 27 | minCount = 5; 28 | minCountLabel = 0; 29 | neg = 5; 30 | wordNgrams = 1; 31 | loss = loss_name::ns; 32 | model = model_name::sg; 33 | bucket = 2000000; 34 | minn = 3; 35 | maxn = 6; 36 | thread = 12; 37 | lrUpdateRate = 100; 38 | t = 1e-4; 39 | label = "__label__"; 40 | verbose = 2; 41 | pretrainedVectors = ""; 42 | saveOutput = false; 43 | 44 | qout = false; 45 | retrain = false; 46 | qnorm_param = false; 47 | cutoff = 0; 48 | dsub = 2; 49 | } 50 | 51 | std::string Args::lossToString(loss_name ln) const { 52 | switch (ln) { 53 | case loss_name::hs: 54 | return "hs"; 55 | case loss_name::ns: 56 | return "ns"; 57 | case loss_name::softmax: 58 | return "softmax"; 59 | case loss_name::ova: 60 | return "one-vs-all"; 61 | } 62 | return "Unknown loss!"; // should never happen 63 | } 64 | 65 | std::string Args::boolToString(bool b) const { 66 | if (b) { 67 | return "true"; 68 | } else { 69 | return "false"; 70 | } 71 | } 72 | 73 | std::string Args::modelToString(model_name mn) const { 74 | switch (mn) { 75 | case model_name::cbow: 76 | return "cbow"; 77 | case model_name::sg: 78 | return "sg"; 79 | case model_name::sup: 80 | return "sup"; 81 | } 82 | return "Unknown model name!"; // should never happen 83 | } 84 | 85 | void Args::parseArgs(const std::vector<std::string>& args) { 86 |
std::string command(args[1]); 87 | if (command == "supervised") { 88 | model = model_name::sup; 89 | loss = loss_name::softmax; 90 | minCount = 1; 91 | minn = 0; 92 | maxn = 0; 93 | lr = 0.1; 94 | } else if (command == "cbow") { 95 | model = model_name::cbow; 96 | } 97 | for (int ai = 2; ai < args.size(); ai += 2) { 98 | if (args[ai][0] != '-') { 99 | Rcpp::Rcout << args[ai - 1] << " " << args[ai] << std::endl; 100 | Rcpp::Rcout << "Provided argument without a dash! Usage:" << std::endl; 101 | printHelp(); 102 | Rcpp::stop("EXIT_FAILURE -- args.cc file -- Args::parseArgs function"); 103 | } 104 | try { 105 | if (args[ai] == "-h") { 106 | Rcpp::Rcout << "Here is the help! Usage:" << std::endl; 107 | printHelp(); 108 | Rcpp::stop("EXIT_FAILURE -- args.cc file -- Args::parseArgs function"); 109 | } else if (args[ai] == "-input") { 110 | input = std::string(args.at(ai + 1)); 111 | } else if (args[ai] == "-output") { 112 | output = std::string(args.at(ai + 1)); 113 | } else if (args[ai] == "-lr") { 114 | lr = std::stof(args.at(ai + 1)); 115 | } else if (args[ai] == "-lrUpdateRate") { 116 | lrUpdateRate = std::stoi(args.at(ai + 1)); 117 | } else if (args[ai] == "-dim") { 118 | dim = std::stoi(args.at(ai + 1)); 119 | } else if (args[ai] == "-ws") { 120 | ws = std::stoi(args.at(ai + 1)); 121 | } else if (args[ai] == "-epoch") { 122 | epoch = std::stoi(args.at(ai + 1)); 123 | } else if (args[ai] == "-minCount") { 124 | minCount = std::stoi(args.at(ai + 1)); 125 | } else if (args[ai] == "-minCountLabel") { 126 | minCountLabel = std::stoi(args.at(ai + 1)); 127 | } else if (args[ai] == "-neg") { 128 | neg = std::stoi(args.at(ai + 1)); 129 | } else if (args[ai] == "-wordNgrams") { 130 | wordNgrams = std::stoi(args.at(ai + 1)); 131 | } else if (args[ai] == "-loss") { 132 | if (args.at(ai + 1) == "hs") { 133 | loss = loss_name::hs; 134 | } else if (args.at(ai + 1) == "ns") { 135 | loss = loss_name::ns; 136 | } else if (args.at(ai + 1) == "softmax") { 137 | loss = 
loss_name::softmax; 138 | } else if ( 139 | args.at(ai + 1) == "one-vs-all" || args.at(ai + 1) == "ova") { 140 | loss = loss_name::ova; 141 | } else { 142 | Rcpp::Rcout << "Unknown loss: " << args.at(ai + 1) << std::endl; 143 | printHelp(); 144 | Rcpp::stop("EXIT_FAILURE -- args.cc file -- Args::parseArgs function"); 145 | } 146 | } else if (args[ai] == "-bucket") { 147 | bucket = std::stoi(args.at(ai + 1)); 148 | } else if (args[ai] == "-minn") { 149 | minn = std::stoi(args.at(ai + 1)); 150 | } else if (args[ai] == "-maxn") { 151 | maxn = std::stoi(args.at(ai + 1)); 152 | } else if (args[ai] == "-thread") { 153 | thread = std::stoi(args.at(ai + 1)); 154 | } else if (args[ai] == "-t") { 155 | t = std::stof(args.at(ai + 1)); 156 | } else if (args[ai] == "-label") { 157 | label = std::string(args.at(ai + 1)); 158 | } else if (args[ai] == "-verbose") { 159 | verbose = std::stoi(args.at(ai + 1)); 160 | } else if (args[ai] == "-pretrainedVectors") { 161 | pretrainedVectors = std::string(args.at(ai + 1)); 162 | } else if (args[ai] == "-saveOutput") { 163 | saveOutput = true; 164 | ai--; 165 | } else if (args[ai] == "-qnorm") { 166 | qnorm_param = true; 167 | ai--; 168 | } else if (args[ai] == "-retrain") { 169 | retrain = true; 170 | ai--; 171 | } else if (args[ai] == "-qout") { 172 | qout = true; 173 | ai--; 174 | } else if (args[ai] == "-cutoff") { 175 | cutoff = std::stoi(args.at(ai + 1)); 176 | } else if (args[ai] == "-dsub") { 177 | dsub = std::stoi(args.at(ai + 1)); 178 | } else { 179 | Rcpp::Rcout << "Unknown argument: " << args[ai] << std::endl; 180 | printHelp(); 181 | Rcpp::stop("EXIT_FAILURE -- args.cc file -- Args::parseArgs function"); 182 | } 183 | } catch ( const std::out_of_range& ) { // use by reference to avoid the following warning [ https://lists.launchpad.net/kicad-developers/msg36082.html ] 184 | // catch (std::out_of_range) { // initially it gave on Debian OS only : "warning: catching polymorphic type ‘class std::out_of_range’ by value 
[-Wcatch-value=]" 185 | Rcpp::Rcout << args[ai] << " is missing an argument" << std::endl; 186 | printHelp(); 187 | Rcpp::stop("EXIT_FAILURE -- args.cc file -- Args::parseArgs function"); 188 | } 189 | } 190 | if (input.empty() || output.empty()) { 191 | Rcpp::Rcout << "Empty input or output path." << std::endl; 192 | printHelp(); 193 | Rcpp::stop("EXIT_FAILURE -- args.cc file -- Args::parseArgs function"); 194 | } 195 | if (wordNgrams <= 1 && maxn == 0) { 196 | bucket = 0; 197 | } 198 | } 199 | 200 | void Args::printHelp() { 201 | printBasicHelp(); 202 | printDictionaryHelp(); 203 | printTrainingHelp(); 204 | printQuantizationHelp(); 205 | } 206 | 207 | void Args::printBasicHelp() { 208 | Rcpp::Rcout << "\nThe following arguments are mandatory:\n" 209 | << " -input training file path\n" 210 | << " -output output file path\n" 211 | << "\nThe following arguments are optional:\n" 212 | << " -verbose verbosity level [" << verbose << "]\n"; 213 | } 214 | 215 | void Args::printDictionaryHelp() { 216 | Rcpp::Rcout << "\nThe following arguments for the dictionary are optional:\n" 217 | << " -minCount minimal number of word occurences [" 218 | << minCount << "]\n" 219 | << " -minCountLabel minimal number of label occurences [" 220 | << minCountLabel << "]\n" 221 | << " -wordNgrams max length of word ngram [" << wordNgrams 222 | << "]\n" 223 | << " -bucket number of buckets [" << bucket << "]\n" 224 | << " -minn min length of char ngram [" << minn 225 | << "]\n" 226 | << " -maxn max length of char ngram [" << maxn 227 | << "]\n" 228 | << " -t sampling threshold [" << t << "]\n" 229 | << " -label labels prefix [" << label << "]\n"; 230 | } 231 | 232 | void Args::printTrainingHelp() { 233 | Rcpp::Rcout 234 | << "\nThe following arguments for training are optional:\n" 235 | << " -lr learning rate [" << lr << "]\n" 236 | << " -lrUpdateRate change the rate of updates for the learning rate [" 237 | << lrUpdateRate << "]\n" 238 | << " -dim size of word vectors [" << dim << "]\n" 
239 | << " -ws size of the context window [" << ws << "]\n" 240 | << " -epoch number of epochs [" << epoch << "]\n" 241 | << " -neg number of negatives sampled [" << neg << "]\n" 242 | << " -loss loss function {ns, hs, softmax, one-vs-all} [" 243 | << lossToString(loss) << "]\n" 244 | << " -thread number of threads [" << thread << "]\n" 245 | << " -pretrainedVectors pretrained word vectors for supervised learning [" 246 | << pretrainedVectors << "]\n" 247 | << " -saveOutput whether output params should be saved [" 248 | << boolToString(saveOutput) << "]\n"; 249 | } 250 | 251 | void Args::printQuantizationHelp() { 252 | Rcpp::Rcout 253 | << "\nThe following arguments for quantization are optional:\n" 254 | << " -cutoff number of words and ngrams to retain [" 255 | << cutoff << "]\n" 256 | << " -retrain whether embeddings are finetuned if a cutoff is applied [" 257 | << boolToString(retrain) << "]\n" 258 | << " -qnorm whether the norm is quantized separately [" 259 | << boolToString(qnorm_param) << "]\n" 260 | << " -qout whether the classifier is quantized [" 261 | << boolToString(qout) << "]\n" 262 | << " -dsub size of each sub-vector [" << dsub << "]\n"; 263 | } 264 | 265 | void Args::save(std::ostream& out) { 266 | out.write((char*)&(dim), sizeof(int)); 267 | out.write((char*)&(ws), sizeof(int)); 268 | out.write((char*)&(epoch), sizeof(int)); 269 | out.write((char*)&(minCount), sizeof(int)); 270 | out.write((char*)&(neg), sizeof(int)); 271 | out.write((char*)&(wordNgrams), sizeof(int)); 272 | out.write((char*)&(loss), sizeof(loss_name)); 273 | out.write((char*)&(model), sizeof(model_name)); 274 | out.write((char*)&(bucket), sizeof(int)); 275 | out.write((char*)&(minn), sizeof(int)); 276 | out.write((char*)&(maxn), sizeof(int)); 277 | out.write((char*)&(lrUpdateRate), sizeof(int)); 278 | out.write((char*)&(t), sizeof(double)); 279 | } 280 | 281 | void Args::load(std::istream& in) { 282 | in.read((char*)&(dim), sizeof(int)); 283 | in.read((char*)&(ws), sizeof(int)); 
284 | in.read((char*)&(epoch), sizeof(int)); 285 | in.read((char*)&(minCount), sizeof(int)); 286 | in.read((char*)&(neg), sizeof(int)); 287 | in.read((char*)&(wordNgrams), sizeof(int)); 288 | in.read((char*)&(loss), sizeof(loss_name)); 289 | in.read((char*)&(model), sizeof(model_name)); 290 | in.read((char*)&(bucket), sizeof(int)); 291 | in.read((char*)&(minn), sizeof(int)); 292 | in.read((char*)&(maxn), sizeof(int)); 293 | in.read((char*)&(lrUpdateRate), sizeof(int)); 294 | in.read((char*)&(t), sizeof(double)); 295 | } 296 | 297 | void Args::dump(std::ostream& out) const { 298 | out << "dim" 299 | << " " << dim << std::endl; 300 | out << "ws" 301 | << " " << ws << std::endl; 302 | out << "epoch" 303 | << " " << epoch << std::endl; 304 | out << "minCount" 305 | << " " << minCount << std::endl; 306 | out << "neg" 307 | << " " << neg << std::endl; 308 | out << "wordNgrams" 309 | << " " << wordNgrams << std::endl; 310 | out << "loss" 311 | << " " << lossToString(loss) << std::endl; 312 | out << "model" 313 | << " " << modelToString(model) << std::endl; 314 | out << "bucket" 315 | << " " << bucket << std::endl; 316 | out << "minn" 317 | << " " << minn << std::endl; 318 | out << "maxn" 319 | << " " << maxn << std::endl; 320 | out << "lrUpdateRate" 321 | << " " << lrUpdateRate << std::endl; 322 | out << "t" 323 | << " " << t << std::endl; 324 | } 325 | 326 | } // namespace fasttext 327 | -------------------------------------------------------------------------------- /src/model.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the MIT license found in the 6 | * LICENSE file in the root directory of this source tree. 
7 | */ 8 | 9 | #include "model.h" 10 | #include "utils.h" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace fasttext { 17 | 18 | constexpr int64_t SIGMOID_TABLE_SIZE = 512; 19 | constexpr int64_t MAX_SIGMOID = 8; 20 | constexpr int64_t LOG_TABLE_SIZE = 512; 21 | 22 | Model::Model( 23 | std::shared_ptr wi, 24 | std::shared_ptr wo, 25 | std::shared_ptr args, 26 | int32_t seed) 27 | : hidden_(args->dim), 28 | output_(wo->size(0)), 29 | grad_(args->dim), 30 | rng(seed), 31 | quant_(false) { 32 | wi_ = wi; 33 | wo_ = wo; 34 | args_ = args; 35 | osz_ = wo->size(0); 36 | hsz_ = args->dim; 37 | negpos = 0; 38 | loss_ = 0.0; 39 | nexamples_ = 1; 40 | t_sigmoid_.reserve(SIGMOID_TABLE_SIZE + 1); 41 | t_log_.reserve(LOG_TABLE_SIZE + 1); 42 | initSigmoid(); 43 | initLog(); 44 | } 45 | 46 | void Model::setQuantizePointer( 47 | std::shared_ptr qwi, 48 | std::shared_ptr qwo, 49 | bool qout) { 50 | qwi_ = qwi; 51 | qwo_ = qwo; 52 | if (qout) { 53 | osz_ = qwo_->getM(); 54 | } 55 | } 56 | 57 | real Model::binaryLogistic(int32_t target, bool label, real lr) { 58 | real score = sigmoid(wo_->dotRow(hidden_, target)); 59 | real alpha = lr * (real(label) - score); 60 | grad_.addRow(*wo_, target, alpha); 61 | wo_->addRow(hidden_, target, alpha); 62 | if (label) { 63 | return -log(score); 64 | } else { 65 | return -log(1.0 - score); 66 | } 67 | } 68 | 69 | real Model::negativeSampling(int32_t target, real lr) { 70 | real loss = 0.0; 71 | grad_.zero(); 72 | for (int32_t n = 0; n <= args_->neg; n++) { 73 | if (n == 0) { 74 | loss += binaryLogistic(target, true, lr); 75 | } else { 76 | loss += binaryLogistic(getNegative(target), false, lr); 77 | } 78 | } 79 | return loss; 80 | } 81 | 82 | real Model::hierarchicalSoftmax(int32_t target, real lr) { 83 | real loss = 0.0; 84 | grad_.zero(); 85 | const std::vector& binaryCode = codes[target]; 86 | const std::vector& pathToRoot = paths[target]; 87 | for (int32_t i = 0; i < pathToRoot.size(); i++) { 88 | loss += 
binaryLogistic(pathToRoot[i], binaryCode[i], lr); 89 | } 90 | return loss; 91 | } 92 | 93 | void Model::computeOutput(Vector& hidden, Vector& output) const { 94 | if (quant_ && args_->qout) { 95 | output.mul(*qwo_, hidden); 96 | } else { 97 | output.mul(*wo_, hidden); 98 | } 99 | } 100 | 101 | void Model::computeOutputSigmoid(Vector& hidden, Vector& output) const { 102 | computeOutput(hidden, output); 103 | for (int32_t i = 0; i < osz_; i++) { 104 | output[i] = sigmoid(output[i]); 105 | } 106 | } 107 | 108 | void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const { 109 | computeOutput(hidden, output); 110 | real max = output[0], z = 0.0; 111 | for (int32_t i = 0; i < osz_; i++) { 112 | max = std::max(output[i], max); 113 | } 114 | for (int32_t i = 0; i < osz_; i++) { 115 | output[i] = exp(output[i] - max); 116 | z += output[i]; 117 | } 118 | for (int32_t i = 0; i < osz_; i++) { 119 | output[i] /= z; 120 | } 121 | } 122 | 123 | void Model::computeOutputSoftmax() { 124 | computeOutputSoftmax(hidden_, output_); 125 | } 126 | 127 | real Model::softmax(int32_t target, real lr) { 128 | grad_.zero(); 129 | computeOutputSoftmax(); 130 | for (int32_t i = 0; i < osz_; i++) { 131 | real label = (i == target) ? 
1.0 : 0.0; 132 | real alpha = lr * (label - output_[i]); 133 | grad_.addRow(*wo_, i, alpha); 134 | wo_->addRow(hidden_, i, alpha); 135 | } 136 | return -log(output_[target]); 137 | } 138 | 139 | real Model::oneVsAll(const std::vector& targets, real lr) { 140 | real loss = 0.0; 141 | for (int32_t i = 0; i < osz_; i++) { 142 | bool isMatch = utils::contains(targets, i); 143 | loss += binaryLogistic(i, isMatch, lr); 144 | } 145 | 146 | return loss; 147 | } 148 | 149 | void Model::computeHidden(const std::vector& input, Vector& hidden) 150 | const { 151 | assert(hidden.size() == hsz_); 152 | hidden.zero(); 153 | for (auto it = input.cbegin(); it != input.cend(); ++it) { 154 | if (quant_) { 155 | hidden.addRow(*qwi_, *it); 156 | } else { 157 | hidden.addRow(*wi_, *it); 158 | } 159 | } 160 | hidden.mul(1.0 / input.size()); 161 | } 162 | 163 | bool Model::comparePairs( 164 | const std::pair& l, 165 | const std::pair& r) { 166 | return l.first > r.first; 167 | } 168 | 169 | void Model::predict( 170 | const std::vector& input, 171 | int32_t k, 172 | real threshold, 173 | std::vector>& heap, 174 | Vector& hidden, 175 | Vector& output) const { 176 | if (k == Model::kUnlimitedPredictions) { 177 | k = osz_; 178 | } else if (k <= 0) { 179 | throw std::invalid_argument("k needs to be 1 or higher!"); 180 | } 181 | if (args_->model != model_name::sup) { 182 | throw std::invalid_argument("Model needs to be supervised for prediction!"); 183 | } 184 | heap.reserve(k + 1); 185 | computeHidden(input, hidden); 186 | if (args_->loss == loss_name::hs) { 187 | dfs(k, threshold, 2 * osz_ - 2, 0.0, heap, hidden); 188 | } else { 189 | findKBest(k, threshold, heap, hidden, output); 190 | } 191 | std::sort_heap(heap.begin(), heap.end(), comparePairs); 192 | } 193 | 194 | void Model::predict( 195 | const std::vector& input, 196 | int32_t k, 197 | real threshold, 198 | std::vector>& heap) { 199 | predict(input, k, threshold, heap, hidden_, output_); 200 | } 201 | 202 | void Model::findKBest( 203 | 
int32_t k, 204 | real threshold, 205 | std::vector>& heap, 206 | Vector& hidden, 207 | Vector& output) const { 208 | if (args_->loss == loss_name::ova) { 209 | computeOutputSigmoid(hidden, output); 210 | } else { 211 | computeOutputSoftmax(hidden, output); 212 | } 213 | for (int32_t i = 0; i < osz_; i++) { 214 | if (output[i] < threshold) { 215 | continue; 216 | } 217 | if (heap.size() == k && std_log(output[i]) < heap.front().first) { 218 | continue; 219 | } 220 | heap.push_back(std::make_pair(std_log(output[i]), i)); 221 | std::push_heap(heap.begin(), heap.end(), comparePairs); 222 | if (heap.size() > k) { 223 | std::pop_heap(heap.begin(), heap.end(), comparePairs); 224 | heap.pop_back(); 225 | } 226 | } 227 | } 228 | 229 | void Model::dfs( 230 | int32_t k, 231 | real threshold, 232 | int32_t node, 233 | real score, 234 | std::vector>& heap, 235 | Vector& hidden) const { 236 | if (score < std_log(threshold)) { 237 | return; 238 | } 239 | if (heap.size() == k && score < heap.front().first) { 240 | return; 241 | } 242 | 243 | if (tree[node].left == -1 && tree[node].right == -1) { 244 | heap.push_back(std::make_pair(score, node)); 245 | std::push_heap(heap.begin(), heap.end(), comparePairs); 246 | if (heap.size() > k) { 247 | std::pop_heap(heap.begin(), heap.end(), comparePairs); 248 | heap.pop_back(); 249 | } 250 | return; 251 | } 252 | 253 | real f; 254 | if (quant_ && args_->qout) { 255 | f = qwo_->dotRow(hidden, node - osz_); 256 | } else { 257 | f = wo_->dotRow(hidden, node - osz_); 258 | } 259 | f = 1. 
/ (1 + std::exp(-f)); 260 | 261 | dfs(k, threshold, tree[node].left, score + std_log(1.0 - f), heap, hidden); 262 | dfs(k, threshold, tree[node].right, score + std_log(f), heap, hidden); 263 | } 264 | 265 | real Model::computeLoss( 266 | const std::vector& targets, 267 | int32_t targetIndex, 268 | real lr) { 269 | real loss = 0.0; 270 | 271 | if (args_->loss == loss_name::ns) { 272 | loss = negativeSampling(targets[targetIndex], lr); 273 | } else if (args_->loss == loss_name::hs) { 274 | loss = hierarchicalSoftmax(targets[targetIndex], lr); 275 | } else if (args_->loss == loss_name::softmax) { 276 | loss = softmax(targets[targetIndex], lr); 277 | } else if (args_->loss == loss_name::ova) { 278 | loss = oneVsAll(targets, lr); 279 | } else { 280 | throw std::invalid_argument("Unhandled loss function for this model."); 281 | } 282 | 283 | return loss; 284 | } 285 | 286 | void Model::update( 287 | const std::vector& input, 288 | const std::vector& targets, 289 | int32_t targetIndex, 290 | real lr) { 291 | if (input.size() == 0) { 292 | return; 293 | } 294 | computeHidden(input, hidden_); 295 | 296 | if (targetIndex == kAllLabelsAsTarget) { 297 | loss_ += computeLoss(targets, -1, lr); 298 | } else { 299 | assert(targetIndex >= 0); 300 | assert(targetIndex < osz_); 301 | loss_ += computeLoss(targets, targetIndex, lr); 302 | } 303 | 304 | nexamples_ += 1; 305 | 306 | if (args_->model == model_name::sup) { 307 | grad_.mul(1.0 / input.size()); 308 | } 309 | for (auto it = input.cbegin(); it != input.cend(); ++it) { 310 | wi_->addRow(grad_, *it, 1.0); 311 | } 312 | } 313 | 314 | void Model::setTargetCounts(const std::vector& counts) { 315 | assert(counts.size() == osz_); 316 | if (args_->loss == loss_name::ns) { 317 | initTableNegatives(counts); 318 | } 319 | if (args_->loss == loss_name::hs) { 320 | buildTree(counts); 321 | } 322 | } 323 | 324 | void Model::initTableNegatives(const std::vector& counts) { 325 | real z = 0.0; 326 | for (size_t i = 0; i < counts.size(); i++) { 
327 | z += pow(counts[i], 0.5); 328 | } 329 | for (size_t i = 0; i < counts.size(); i++) { 330 | real c = pow(counts[i], 0.5); 331 | for (size_t j = 0; j < c * NEGATIVE_TABLE_SIZE / z; j++) { 332 | negatives_.push_back(i); 333 | } 334 | } 335 | std::shuffle(negatives_.begin(), negatives_.end(), rng); 336 | } 337 | 338 | int32_t Model::getNegative(int32_t target) { 339 | int32_t negative; 340 | do { 341 | negative = negatives_[negpos]; 342 | negpos = (negpos + 1) % negatives_.size(); 343 | } while (target == negative); 344 | return negative; 345 | } 346 | 347 | void Model::buildTree(const std::vector& counts) { 348 | tree.resize(2 * osz_ - 1); 349 | for (int32_t i = 0; i < 2 * osz_ - 1; i++) { 350 | tree[i].parent = -1; 351 | tree[i].left = -1; 352 | tree[i].right = -1; 353 | tree[i].count = 1e15; 354 | tree[i].binary = false; 355 | } 356 | for (int32_t i = 0; i < osz_; i++) { 357 | tree[i].count = counts[i]; 358 | } 359 | int32_t leaf = osz_ - 1; 360 | int32_t node = osz_; 361 | for (int32_t i = osz_; i < 2 * osz_ - 1; i++) { 362 | int32_t mini[2]; 363 | for (int32_t j = 0; j < 2; j++) { 364 | if (leaf >= 0 && tree[leaf].count < tree[node].count) { 365 | mini[j] = leaf--; 366 | } else { 367 | mini[j] = node++; 368 | } 369 | } 370 | tree[i].left = mini[0]; 371 | tree[i].right = mini[1]; 372 | tree[i].count = tree[mini[0]].count + tree[mini[1]].count; 373 | tree[mini[0]].parent = i; 374 | tree[mini[1]].parent = i; 375 | tree[mini[1]].binary = true; 376 | } 377 | for (int32_t i = 0; i < osz_; i++) { 378 | std::vector path; 379 | std::vector code; 380 | int32_t j = i; 381 | while (tree[j].parent != -1) { 382 | path.push_back(tree[j].parent - osz_); 383 | code.push_back(tree[j].binary); 384 | j = tree[j].parent; 385 | } 386 | paths.push_back(path); 387 | codes.push_back(code); 388 | } 389 | } 390 | 391 | real Model::getLoss() const { 392 | return loss_ / nexamples_; 393 | } 394 | 395 | void Model::initSigmoid() { 396 | for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) { 
397 | real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID; 398 | t_sigmoid_.push_back(1.0 / (1.0 + std::exp(-x))); 399 | } 400 | } 401 | 402 | void Model::initLog() { 403 | for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) { 404 | real x = (real(i) + 1e-5) / LOG_TABLE_SIZE; 405 | t_log_.push_back(std::log(x)); 406 | } 407 | } 408 | 409 | real Model::log(real x) const { 410 | if (x > 1.0) { 411 | return 0.0; 412 | } 413 | int64_t i = int64_t(x * LOG_TABLE_SIZE); 414 | return t_log_[i]; 415 | } 416 | 417 | real Model::std_log(real x) const { 418 | return std::log(x + 1e-5); 419 | } 420 | 421 | real Model::sigmoid(real x) const { 422 | if (x < -MAX_SIGMOID) { 423 | return 0.0; 424 | } else if (x > MAX_SIGMOID) { 425 | return 1.0; 426 | } else { 427 | int64_t i = 428 | int64_t((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2); 429 | return t_sigmoid_[i]; 430 | } 431 | } 432 | 433 | } // namespace fasttext 434 | -------------------------------------------------------------------------------- /src/dictionary.cc: -------------------------------------------------------------------------------- 1 | # include 2 | // [[Rcpp::depends("Rcpp")]] 3 | // [[Rcpp::plugins(cpp11)]] 4 | 5 | /** 6 | * Copyright (c) 2016-present, Facebook, Inc. 7 | * All rights reserved. 8 | * 9 | * This source code is licensed under the MIT license found in the 10 | * LICENSE file in the root directory of this source tree. 
11 | */ 12 | 13 | #include "dictionary.h" 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace fasttext { 25 | 26 | const std::string Dictionary::EOS = ""; 27 | const std::string Dictionary::BOW = "<"; 28 | const std::string Dictionary::EOW = ">"; 29 | 30 | Dictionary::Dictionary(std::shared_ptr args) 31 | : args_(args), 32 | word2int_(MAX_VOCAB_SIZE, -1), 33 | size_(0), 34 | nwords_(0), 35 | nlabels_(0), 36 | ntokens_(0), 37 | pruneidx_size_(-1) {} 38 | 39 | Dictionary::Dictionary(std::shared_ptr args, std::istream& in) 40 | : args_(args), 41 | size_(0), 42 | nwords_(0), 43 | nlabels_(0), 44 | ntokens_(0), 45 | pruneidx_size_(-1) { 46 | load(in); 47 | } 48 | 49 | int32_t Dictionary::find(const std::string& w) const { 50 | return find(w, hash(w)); 51 | } 52 | 53 | int32_t Dictionary::find(const std::string& w, uint32_t h) const { 54 | int32_t word2intsize = word2int_.size(); 55 | int32_t id = h % word2intsize; 56 | while (word2int_[id] != -1 && words_[word2int_[id]].word != w) { 57 | id = (id + 1) % word2intsize; 58 | } 59 | return id; 60 | } 61 | 62 | void Dictionary::add(const std::string& w) { 63 | int32_t h = find(w); 64 | ntokens_++; 65 | if (word2int_[h] == -1) { 66 | entry e; 67 | e.word = w; 68 | e.count = 1; 69 | e.type = getType(w); 70 | words_.push_back(e); 71 | word2int_[h] = size_++; 72 | } else { 73 | words_[word2int_[h]].count++; 74 | } 75 | } 76 | 77 | int32_t Dictionary::nwords() const { 78 | return nwords_; 79 | } 80 | 81 | int32_t Dictionary::nlabels() const { 82 | return nlabels_; 83 | } 84 | 85 | int64_t Dictionary::ntokens() const { 86 | return ntokens_; 87 | } 88 | 89 | const std::vector& Dictionary::getSubwords(int32_t i) const { 90 | assert(i >= 0); 91 | assert(i < nwords_); 92 | return words_[i].subwords; 93 | } 94 | 95 | const std::vector Dictionary::getSubwords( 96 | const std::string& word) const { 97 | int32_t i = getId(word); 98 | if (i >= 0) { 99 | return 
getSubwords(i); 100 | } 101 | std::vector ngrams; 102 | if (word != EOS) { 103 | computeSubwords(BOW + word + EOW, ngrams); 104 | } 105 | return ngrams; 106 | } 107 | 108 | void Dictionary::getSubwords( 109 | const std::string& word, 110 | std::vector& ngrams, 111 | std::vector& substrings) const { 112 | int32_t i = getId(word); 113 | ngrams.clear(); 114 | substrings.clear(); 115 | if (i >= 0) { 116 | ngrams.push_back(i); 117 | substrings.push_back(words_[i].word); 118 | } 119 | if (word != EOS) { 120 | computeSubwords(BOW + word + EOW, ngrams, &substrings); 121 | } 122 | } 123 | 124 | bool Dictionary::discard(int32_t id, real rand) const { 125 | assert(id >= 0); 126 | assert(id < nwords_); 127 | if (args_->model == model_name::sup) { 128 | return false; 129 | } 130 | return rand > pdiscard_[id]; 131 | } 132 | 133 | int32_t Dictionary::getId(const std::string& w, uint32_t h) const { 134 | int32_t id = find(w, h); 135 | return word2int_[id]; 136 | } 137 | 138 | int32_t Dictionary::getId(const std::string& w) const { 139 | int32_t h = find(w); 140 | return word2int_[h]; 141 | } 142 | 143 | entry_type Dictionary::getType(int32_t id) const { 144 | assert(id >= 0); 145 | assert(id < size_); 146 | return words_[id].type; 147 | } 148 | 149 | entry_type Dictionary::getType(const std::string& w) const { 150 | return (w.find(args_->label) == 0) ? entry_type::label : entry_type::word; 151 | } 152 | 153 | std::string Dictionary::getWord(int32_t id) const { 154 | assert(id >= 0); 155 | assert(id < size_); 156 | return words_[id].word; 157 | } 158 | 159 | // The correct implementation of fnv should be: 160 | // h = h ^ uint32_t(uint8_t(str[i])); 161 | // Unfortunately, earlier version of fasttext used 162 | // h = h ^ uint32_t(str[i]); 163 | // which is undefined behavior (as char can be signed or unsigned). 
164 | // Since all fasttext models that were already released were trained 165 | // using signed char, we fixed the hash function to make models 166 | // compatible whatever compiler is used. 167 | uint32_t Dictionary::hash(const std::string& str) const { 168 | uint32_t h = 2166136261; 169 | for (size_t i = 0; i < str.size(); i++) { 170 | h = h ^ uint32_t(int8_t(str[i])); 171 | h = h * 16777619; 172 | } 173 | return h; 174 | } 175 | 176 | void Dictionary::computeSubwords( 177 | const std::string& word, 178 | std::vector& ngrams, 179 | std::vector* substrings) const { 180 | for (size_t i = 0; i < word.size(); i++) { 181 | std::string ngram; 182 | if ((word[i] & 0xC0) == 0x80) { 183 | continue; 184 | } 185 | for (size_t j = i, n = 1; j < word.size() && n <= args_->maxn; n++) { 186 | ngram.push_back(word[j++]); 187 | while (j < word.size() && (word[j] & 0xC0) == 0x80) { 188 | ngram.push_back(word[j++]); 189 | } 190 | if (n >= args_->minn && !(n == 1 && (i == 0 || j == word.size()))) { 191 | int32_t h = hash(ngram) % args_->bucket; 192 | pushHash(ngrams, h); 193 | if (substrings) { 194 | substrings->push_back(ngram); 195 | } 196 | } 197 | } 198 | } 199 | } 200 | 201 | void Dictionary::initNgrams() { 202 | for (size_t i = 0; i < size_; i++) { 203 | std::string word = BOW + words_[i].word + EOW; 204 | words_[i].subwords.clear(); 205 | words_[i].subwords.push_back(i); 206 | if (words_[i].word != EOS) { 207 | computeSubwords(word, words_[i].subwords); 208 | } 209 | } 210 | } 211 | 212 | bool Dictionary::readWord(std::istream& in, std::string& word) const { 213 | int c; 214 | std::streambuf& sb = *in.rdbuf(); 215 | word.clear(); 216 | while ((c = sb.sbumpc()) != EOF) { 217 | if (c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || 218 | c == '\f' || c == '\0') { 219 | if (word.empty()) { 220 | if (c == '\n') { 221 | word += EOS; 222 | return true; 223 | } 224 | continue; 225 | } else { 226 | if (c == '\n') 227 | sb.sungetc(); 228 | return true; 229 | } 230 | } 
231 | word.push_back(c); 232 | } 233 | // trigger eofbit 234 | in.get(); 235 | return !word.empty(); 236 | } 237 | 238 | void Dictionary::readFromFile(std::istream& in) { 239 | std::string word; 240 | int64_t minThreshold = 1; 241 | while (readWord(in, word)) { 242 | add(word); 243 | if (ntokens_ % 1000000 == 0 && args_->verbose > 1) { 244 | Rcpp::Rcout << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush; 245 | } 246 | if (size_ > 0.75 * MAX_VOCAB_SIZE) { 247 | minThreshold++; 248 | threshold(minThreshold, minThreshold); 249 | } 250 | } 251 | threshold(args_->minCount, args_->minCountLabel); 252 | initTableDiscard(); 253 | initNgrams(); 254 | if (args_->verbose > 0) { 255 | Rcpp::Rcout << "\rRead " << ntokens_ / 1000000 << "M words" << std::endl; 256 | Rcpp::Rcout << "Number of words: " << nwords_ << std::endl; 257 | Rcpp::Rcout << "Number of labels: " << nlabels_ << std::endl; 258 | } 259 | if (size_ == 0) { 260 | throw std::invalid_argument( 261 | "Empty vocabulary. Try a smaller -minCount value."); 262 | } 263 | } 264 | 265 | void Dictionary::threshold(int64_t t, int64_t tl) { 266 | sort(words_.begin(), words_.end(), [](const entry& e1, const entry& e2) { 267 | if (e1.type != e2.type) { 268 | return e1.type < e2.type; 269 | } 270 | return e1.count > e2.count; 271 | }); 272 | words_.erase( 273 | remove_if( 274 | words_.begin(), 275 | words_.end(), 276 | [&](const entry& e) { 277 | return (e.type == entry_type::word && e.count < t) || 278 | (e.type == entry_type::label && e.count < tl); 279 | }), 280 | words_.end()); 281 | words_.shrink_to_fit(); 282 | size_ = 0; 283 | nwords_ = 0; 284 | nlabels_ = 0; 285 | std::fill(word2int_.begin(), word2int_.end(), -1); 286 | for (auto it = words_.begin(); it != words_.end(); ++it) { 287 | int32_t h = find(it->word); 288 | word2int_[h] = size_++; 289 | if (it->type == entry_type::word) { 290 | nwords_++; 291 | } 292 | if (it->type == entry_type::label) { 293 | nlabels_++; 294 | } 295 | } 296 | } 297 | 298 | void 
Dictionary::initTableDiscard() { 299 | pdiscard_.resize(size_); 300 | for (size_t i = 0; i < size_; i++) { 301 | real f = real(words_[i].count) / real(ntokens_); 302 | pdiscard_[i] = std::sqrt(args_->t / f) + args_->t / f; 303 | } 304 | } 305 | 306 | std::vector Dictionary::getCounts(entry_type type) const { 307 | std::vector counts; 308 | for (auto& w : words_) { 309 | if (w.type == type) { 310 | counts.push_back(w.count); 311 | } 312 | } 313 | return counts; 314 | } 315 | 316 | void Dictionary::addWordNgrams( 317 | std::vector& line, 318 | const std::vector& hashes, 319 | int32_t n) const { 320 | for (int32_t i = 0; i < hashes.size(); i++) { 321 | uint64_t h = hashes[i]; 322 | for (int32_t j = i + 1; j < hashes.size() && j < i + n; j++) { 323 | h = h * 116049371 + hashes[j]; 324 | pushHash(line, h % args_->bucket); 325 | } 326 | } 327 | } 328 | 329 | void Dictionary::addSubwords( 330 | std::vector& line, 331 | const std::string& token, 332 | int32_t wid) const { 333 | if (wid < 0) { // out of vocab 334 | if (token != EOS) { 335 | computeSubwords(BOW + token + EOW, line); 336 | } 337 | } else { 338 | if (args_->maxn <= 0) { // in vocab w/o subwords 339 | line.push_back(wid); 340 | } else { // in vocab w/ subwords 341 | const std::vector& ngrams = getSubwords(wid); 342 | line.insert(line.end(), ngrams.cbegin(), ngrams.cend()); 343 | } 344 | } 345 | } 346 | 347 | void Dictionary::reset(std::istream& in) const { 348 | if (in.eof()) { 349 | in.clear(); 350 | in.seekg(std::streampos(0)); 351 | } 352 | } 353 | 354 | int32_t Dictionary::getLine( 355 | std::istream& in, 356 | std::vector& words, 357 | std::minstd_rand& rng) const { 358 | std::uniform_real_distribution<> uniform(0, 1); 359 | std::string token; 360 | int32_t ntokens = 0; 361 | 362 | reset(in); 363 | words.clear(); 364 | while (readWord(in, token)) { 365 | int32_t h = find(token); 366 | int32_t wid = word2int_[h]; 367 | if (wid < 0) { 368 | continue; 369 | } 370 | 371 | ntokens++; 372 | if (getType(wid) == 
entry_type::word && !discard(wid, uniform(rng))) { 373 | words.push_back(wid); 374 | } 375 | if (ntokens > MAX_LINE_SIZE || token == EOS) { 376 | break; 377 | } 378 | } 379 | return ntokens; 380 | } 381 | 382 | int32_t Dictionary::getLine( 383 | std::istream& in, 384 | std::vector& words, 385 | std::vector& labels) const { 386 | std::vector word_hashes; 387 | std::string token; 388 | int32_t ntokens = 0; 389 | 390 | reset(in); 391 | words.clear(); 392 | labels.clear(); 393 | while (readWord(in, token)) { 394 | uint32_t h = hash(token); 395 | int32_t wid = getId(token, h); 396 | entry_type type = wid < 0 ? getType(token) : getType(wid); 397 | 398 | ntokens++; 399 | if (type == entry_type::word) { 400 | addSubwords(words, token, wid); 401 | word_hashes.push_back(h); 402 | } else if (type == entry_type::label && wid >= 0) { 403 | labels.push_back(wid - nwords_); 404 | } 405 | if (token == EOS) { 406 | break; 407 | } 408 | } 409 | addWordNgrams(words, word_hashes, args_->wordNgrams); 410 | return ntokens; 411 | } 412 | 413 | void Dictionary::pushHash(std::vector& hashes, int32_t id) const { 414 | if (pruneidx_size_ == 0 || id < 0) { 415 | return; 416 | } 417 | if (pruneidx_size_ > 0) { 418 | if (pruneidx_.count(id)) { 419 | id = pruneidx_.at(id); 420 | } else { 421 | return; 422 | } 423 | } 424 | hashes.push_back(nwords_ + id); 425 | } 426 | 427 | std::string Dictionary::getLabel(int32_t lid) const { 428 | if (lid < 0 || lid >= nlabels_) { 429 | throw std::invalid_argument( 430 | "Label id is out of range [0, " + std::to_string(nlabels_) + "]"); 431 | } 432 | return words_[lid + nwords_].word; 433 | } 434 | 435 | void Dictionary::save(std::ostream& out) const { 436 | out.write((char*)&size_, sizeof(int32_t)); 437 | out.write((char*)&nwords_, sizeof(int32_t)); 438 | out.write((char*)&nlabels_, sizeof(int32_t)); 439 | out.write((char*)&ntokens_, sizeof(int64_t)); 440 | out.write((char*)&pruneidx_size_, sizeof(int64_t)); 441 | for (int32_t i = 0; i < size_; i++) { 442 | 
entry e = words_[i]; 443 | out.write(e.word.data(), e.word.size() * sizeof(char)); 444 | out.put(0); 445 | out.write((char*)&(e.count), sizeof(int64_t)); 446 | out.write((char*)&(e.type), sizeof(entry_type)); 447 | } 448 | for (const auto pair : pruneidx_) { 449 | out.write((char*)&(pair.first), sizeof(int32_t)); 450 | out.write((char*)&(pair.second), sizeof(int32_t)); 451 | } 452 | } 453 | 454 | void Dictionary::load(std::istream& in) { 455 | words_.clear(); 456 | in.read((char*)&size_, sizeof(int32_t)); 457 | in.read((char*)&nwords_, sizeof(int32_t)); 458 | in.read((char*)&nlabels_, sizeof(int32_t)); 459 | in.read((char*)&ntokens_, sizeof(int64_t)); 460 | in.read((char*)&pruneidx_size_, sizeof(int64_t)); 461 | for (int32_t i = 0; i < size_; i++) { 462 | char c; 463 | entry e; 464 | while ((c = in.get()) != 0) { 465 | e.word.push_back(c); 466 | } 467 | in.read((char*)&e.count, sizeof(int64_t)); 468 | in.read((char*)&e.type, sizeof(entry_type)); 469 | words_.push_back(e); 470 | } 471 | pruneidx_.clear(); 472 | for (int32_t i = 0; i < pruneidx_size_; i++) { 473 | int32_t first; 474 | int32_t second; 475 | in.read((char*)&first, sizeof(int32_t)); 476 | in.read((char*)&second, sizeof(int32_t)); 477 | pruneidx_[first] = second; 478 | } 479 | initTableDiscard(); 480 | initNgrams(); 481 | 482 | int32_t word2intsize = std::ceil(size_ / 0.7); 483 | word2int_.assign(word2intsize, -1); 484 | for (int32_t i = 0; i < size_; i++) { 485 | word2int_[find(words_[i].word)] = i; 486 | } 487 | } 488 | 489 | void Dictionary::init() { 490 | initTableDiscard(); 491 | initNgrams(); 492 | } 493 | 494 | void Dictionary::prune(std::vector& idx) { 495 | std::vector words, ngrams; 496 | for (auto it = idx.cbegin(); it != idx.cend(); ++it) { 497 | if (*it < nwords_) { 498 | words.push_back(*it); 499 | } else { 500 | ngrams.push_back(*it); 501 | } 502 | } 503 | std::sort(words.begin(), words.end()); 504 | idx = words; 505 | 506 | if (ngrams.size() != 0) { 507 | int32_t j = 0; 508 | for (const 
auto ngram : ngrams) { 509 | pruneidx_[ngram - nwords_] = j; 510 | j++; 511 | } 512 | idx.insert(idx.end(), ngrams.begin(), ngrams.end()); 513 | } 514 | pruneidx_size_ = pruneidx_.size(); 515 | 516 | std::fill(word2int_.begin(), word2int_.end(), -1); 517 | 518 | int32_t j = 0; 519 | for (int32_t i = 0; i < words_.size(); i++) { 520 | if (getType(i) == entry_type::label || 521 | (j < words.size() && words[j] == i)) { 522 | words_[j] = words_[i]; 523 | word2int_[find(words_[j].word)] = j; 524 | j++; 525 | } 526 | } 527 | nwords_ = words.size(); 528 | size_ = nwords_ + nlabels_; 529 | words_.erase(words_.begin() + size_, words_.end()); 530 | initNgrams(); 531 | } 532 | 533 | void Dictionary::dump(std::ostream& out) const { 534 | out << words_.size() << std::endl; 535 | for (auto it : words_) { 536 | std::string entryType = "word"; 537 | if (it.type == entry_type::label) { 538 | entryType = "label"; 539 | } 540 | out << it.word << " " << it.count << " " << entryType << std::endl; 541 | } 542 | } 543 | 544 | } // namespace fasttext 545 | -------------------------------------------------------------------------------- /tests/testthat/test-fasttext.R: -------------------------------------------------------------------------------- 1 | 2 | #--------------------------------------------------------------------------------------- data 3 | path_read = file.path(getwd(), "example_text.txt") 4 | default_write_path = file.path(getwd(), 'save_model_vecs') 5 | path_write_vecs = file.path(default_write_path, 'word_vectors') 6 | path_write_logs = file.path(default_write_path, 'model_logs.txt') 7 | path_supervised = file.path(getwd(), 'cooking_supervised.txt') 8 | path_lang_identify = file.path(getwd(), 'declaration_human_rights_english.txt') 9 | pre_train_ftz = system.file("language_identification/lid.176.ftz", package = "fastText") 10 | #--------------------------------------------------------------------------------------- 11 | 12 | 13 | context('tests for all functions') 14 | 15 | 
#=========================
# print usage of functions
#=========================

# Each usage helper only has to write something to the console.

testthat::test_that("it prints information for the 'printDumpUsage' function", {
  testthat::expect_output(printDumpUsage())
})

testthat::test_that("it prints information for the 'printNNUsage' function", {
  testthat::expect_output(printNNUsage())
})

testthat::test_that("it prints information for the 'printPredictUsage' function", {
  testthat::expect_output(printPredictUsage())
})

testthat::test_that("it prints information for the 'printPrintNgramsUsage' function", {
  testthat::expect_output(printPrintNgramsUsage())
})

testthat::test_that("it prints information for the 'printPrintSentenceVectorsUsage' function", {
  testthat::expect_output(printPrintSentenceVectorsUsage())
})

testthat::test_that("it prints information for the 'printPrintWordVectorsUsage' function", {
  testthat::expect_output(printPrintWordVectorsUsage())
})

testthat::test_that("it prints information for the 'printQuantizeUsage' function", {
  testthat::expect_output(printQuantizeUsage())
})

testthat::test_that("it prints information for the 'printTestLabelUsage' function", {
  testthat::expect_output(printTestLabelUsage())
})

testthat::test_that("it prints information for the 'printTestUsage' function", {
  testthat::expect_output(printTestUsage())
})

testthat::test_that("it prints information for the 'printUsage' function", {
  testthat::expect_output(printUsage())
})

# Same check for the per-command parameter listing.
testthat::test_that("it prints information about the parameters of a specified command", {
  testthat::expect_output(print_parameters(command = 'supervised'))
})


#==============================
# 'fasttext_interface' function [ 'expect_true' and 'expect_output' ]
#==============================

# NOTE: these tests run in order and share 'default_write_path'; the model written by an
#       earlier test ('word_vectors.bin') is the input of the later ones.


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'cbow' command", {

  list_params = list(command = 'cbow',
                     lr = 0.1,
                     dim = 5,
                     input = path_read,
                     output = path_write_vecs,
                     verbose = 2,
                     thread = 1)

  res = fasttext_interface(list_params,
                           path_output = path_write_logs,
                           MilliSecs = 100)

  out = list.files(default_write_path, full.names = FALSE)

  testthat::expect_true( length(out) == 4 && all(out %in% c("DONT_DELETE_THIS_FILE.txt", "model_logs.txt", "word_vectors.bin", "word_vectors.vec")) )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'skipgram' command", {

  list_params = list(command = 'skipgram',
                     lr = 0.1,
                     dim = 5,
                     input = path_read,
                     output = path_write_vecs,
                     verbose = 2,
                     thread = 1)

  res = fasttext_interface(list_params,
                           path_output = path_write_logs,
                           MilliSecs = 100)

  out = list.files(default_write_path, full.names = FALSE)

  testthat::expect_true( length(out) == 4 && all(out %in% c("DONT_DELETE_THIS_FILE.txt", "model_logs.txt", "word_vectors.bin", "word_vectors.vec")) )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'supervised' command", {

  list_params = list(command = 'supervised',
                     lr = 0.1,
                     dim = 5,
                     input = path_supervised,
                     output = path_write_vecs,
                     verbose = 2,
                     thread = 1)

  res = fasttext_interface(list_params,
                           path_output = path_write_logs,
                           MilliSecs = 100)

  out = list.files(default_write_path, full.names = FALSE)

  testthat::expect_true( length(out) == 4 && all(out %in% c("DONT_DELETE_THIS_FILE.txt", "model_logs.txt", "word_vectors.bin", "word_vectors.vec")) )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'predict' and 'predict-prob' command", {

  list_params = list(command = 'predict',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     test_data = file.path(getwd(), 'cooking_valid.txt'),
                     k = 1,
                     th = 0.0)

  res = fasttext_interface(list_params, path_output = file.path(default_write_path, 'preds_valid.txt'))

  out_preds = list.files(default_write_path, full.names = FALSE)
  out_preds = ('preds_valid.txt' %in% out_preds)
  read_preds_valid = utils::read.table(file.path(default_write_path, 'preds_valid.txt'), quote="\"", comment.char="")
  ncol_valid = ncol(read_preds_valid) == 1                 # single column output

  list_params = list(command = 'predict-prob',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     test_data = file.path(getwd(), 'cooking_valid.txt'),
                     k = 1,
                     th = 0.0)

  res = fasttext_interface(list_params, path_output = file.path(default_write_path, 'preds_valid.txt'))

  out_preds_prob = list.files(default_write_path, full.names = FALSE)
  out_preds_prob = ('preds_valid.txt' %in% out_preds_prob)
  read_preds_valid = utils::read.table(file.path(default_write_path, 'preds_valid.txt'), quote="\"", comment.char="")
  ncol_valid_prob = ncol(read_preds_valid) == 2            # 2-column output (probabilities, too)

  testthat::expect_true( all(c(out_preds, out_preds_prob)) && all(ncol_valid, ncol_valid_prob) )
})



testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'test-label' command", {

  list_params = list(command = 'test-label',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     test_data = file.path(getwd(), 'cooking_valid.txt'),
                     k = 5,
                     th = 0.0)

  res = fasttext_interface(list_params, path_output = file.path(default_write_path, 'preds_valid.txt'))

  out_preds_prob = list.files(default_write_path, full.names = FALSE)
  out_preds_prob = ('preds_valid.txt' %in% out_preds_prob)
  read_preds_valid = utils::read.table(file.path(default_write_path, 'preds_valid.txt'), quote="\"", comment.char="")
  ncol_valid_prob = ncol(read_preds_valid) == 10           # 10-column output (precision & recall, too)

  testthat::expect_true( ncol_valid_prob )
})



testthat::test_that("the 'fasttext_interface' function prints information to the R session (precision, recall) when using the 'test' command", {

  list_params = list(command = 'test',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     test_data = file.path(getwd(), 'cooking_valid.txt'),
                     k = 5,
                     th = 0.0)                             # it prints precision, recall to the R session (only)

  testthat::expect_output( fasttext_interface(list_params) )
})


testthat::test_that("the 'fasttext_interface' function will create an .ftz file when using the 'quantize' command", {

  pth_in_bin = file.path(default_write_path, 'word_vectors.bin')
  pth_out_ftz = file.path(default_write_path, 'word_vectors.ftz')

  list_params = list(command = 'quantize',
                     input = pth_in_bin,
                     output = pth_out_ftz)

  res = fasttext_interface(list_params)

  ftz_exists = file.exists(pth_out_ftz)
  ftz_smaller_size_than_bin = (file.size(pth_in_bin) > file.size(pth_out_ftz))

  testthat::expect_true( ftz_exists && ftz_smaller_size_than_bin )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'print-word-vectors' command", {

  list_params = list(command = 'print-word-vectors',
                     model = file.path(default_write_path, 'word_vectors.bin'))

  out_data = file.path(default_write_path, 'preds_valid.txt')

  res = fasttext_interface(list_params,
                           path_input = file.path(getwd(), 'queries.txt'),
                           path_output = out_data)

  read_word_vecs = utils::read.table(out_data, quote="\"", comment.char="")

  testthat::expect_true( nrow(read_word_vecs) == 5 && ncol(read_word_vecs) == 6 )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'print-sentence-vectors' command", {

  list_params = list(command = 'print-sentence-vectors',
                     model = file.path(default_write_path, 'word_vectors.bin'))

  out_data = file.path(default_write_path, 'preds_valid.txt')

  res = fasttext_interface(list_params,
                           path_input = file.path(getwd(), 'text_sentence.txt'),
                           path_output = out_data)

  # the 3rd and 4th rows must give the same output because they are the same sentences
  read_word_vecs = utils::read.table(out_data, quote="\"", comment.char="")

  testthat::expect_true( nrow(read_word_vecs) == 5 && ncol(read_word_vecs) == 5 && all(read_word_vecs[3, ] == read_word_vecs[4, ]) )
})



testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'print-ngrams' command", {

  list_params = list(command = 'skipgram',
                     lr = 0.1,
                     dim = 5,
                     input = path_read,
                     output = path_write_vecs,
                     verbose = 2,
                     thread = 1,
                     minn = 2,
                     maxn = 2)

  res = fasttext_interface(list_params, path_output = path_write_logs, MilliSecs = 100)

  list_params = list(command = 'print-ngrams',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     word = 'word')

  out_data = file.path(default_write_path, 'preds_valid.txt')

  res = fasttext_interface(list_params, path_output = out_data)

  read_ngrams = utils::read.table(out_data, quote="\"", comment.char="")

  # 'print-ngrams' prints to R session too, just use : res = fasttext_interface(list_params, path_output = "")
  testthat::expect_true( all(dim(read_ngrams) == c(5, 6)) )
})



testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'nn' command", {

  list_params = list(command = 'nn',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     k = 5,
                     query_word = 'word')

  out_data = file.path(default_write_path, 'preds_valid.txt')

  res = fasttext_interface(list_params, path_output = out_data)

  read_nn = utils::read.table(out_data, quote="\"", comment.char="")

  testthat::expect_true( nrow(read_nn) == list_params[['k']] && ncol(read_nn) == 2 )
})



testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'analogies' command", {

  list_params = list(command = 'analogies',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     k = 5)

  out_data = file.path(default_write_path, 'preds_valid.txt')

  res = fasttext_interface(list_params, path_input = file.path(getwd(), 'analogy_queries.txt'), path_output = out_data)

  # the 'analogy_queries.txt' file contains 4 triplets and I'm looking for 5 analogies for each triplet.
  # therefore the output file should contain : 4 * 5 + 4 = 24 rows  ( I've added a 4 because after each k-analogies I've added an empty line )

  read_analogies = utils::read.table(out_data, quote="\"", comment.char="", blank.lines.skip = FALSE)

  testthat::expect_true( nrow(read_analogies) == (4 * 5 + 4) && ncol(read_analogies) == 2 )
})


testthat::test_that("the 'fasttext_interface' function writes output to folder when using the 'dump' command", {

  list_params = list(command = 'dump',
                     model = file.path(default_write_path, 'word_vectors.bin'),
                     option = 'args')

  out_data = file.path(default_write_path, 'preds_valid.txt')

  res = fasttext_interface(list_params, path_output = out_data, remove_previous_file = TRUE)

  read_dump = utils::read.table(out_data, quote="\"", comment.char="")

  testthat::expect_true( all(dim(read_dump) == c(13, 2)) )
})


#===================================
# 'language_identification' function
#===================================


testthat::test_that("the 'language_identification' function gives an error if the 'input_obj' parameter is neither a character vector consisting of character string(s) nor a valid path to a file", {

  lst_invalid = list(1,2,3)

  testthat::expect_error( language_identification(input_obj = lst_invalid,
                                                  pre_trained_language_model_path = pre_train_ftz,
                                                  k = 1,
                                                  th = 0.0,
                                                  verbose = TRUE) )
})


testthat::test_that("the 'language_identification' function gives an error if the 'pre_trained_language_model_path' parameter does not point to a valid pre-trained weights file", {

  vec_txt = c("Incapaz de distinguir la luna y la cara de esta chica, Las estrellas se ponen nerviosas en el cielo.",
              "Unable to tell apart the moon and this girl's face, Stars are flustered up in the sky.")

  file_pretrained = 'INVALID_pre_trained_weights'

  testthat::expect_error( language_identification(input_obj = vec_txt,
                                                  pre_trained_language_model_path = file_pretrained,
                                                  k = 1,
                                                  th = 0.0,
                                                  verbose = TRUE) )
})


testthat::test_that("the 'language_identification' function returns the correct output when it takes a character vector of character strings as input", {

  vec_txt = c("Incapaz de distinguir la luna y la cara de esta chica, Las estrellas se ponen nerviosas en el cielo.",
              "Unable to tell apart the moon and this girl's face, Stars are flustered up in the sky.")

  res_out = language_identification(input_obj = vec_txt,
                                    pre_trained_language_model_path = pre_train_ftz,
                                    k = 1,
                                    th = 0.0,
                                    verbose = TRUE)

  testthat::expect_true( inherits(res_out, 'data.table') && nrow(res_out) == 2 )
})


testthat::test_that("the 'language_identification' function returns the correct output when it takes a valid path to a text file as input", {

  res_out = language_identification(input_obj = path_lang_identify,
                                    pre_trained_language_model_path = pre_train_ftz,
                                    k = 1,
                                    th = 0.0,
                                    verbose = TRUE)

  testthat::expect_true( inherits(res_out, 'data.table') && nrow(res_out) > 1 && length(unique(res_out$iso_lang_1)) >= 1 )
})


testthat::test_that("the 'language_identification' function returns the correct output if the input object is 'data' (see Github issue https://github.com/mlampros/fastText/issues/3)", {

  res_out = language_identification(input_obj = "data",
                                    pre_trained_language_model_path = pre_train_ftz,
                                    k = 1,
                                    th = 0.0,
                                    verbose = TRUE)

  testthat::expect_true( inherits(res_out, 'data.table') && nrow(res_out) == 1 )
})

--------------------------------------------------------------------------------