├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── Contributor_Code_of_Conduct.md ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── download_checkpoint.R ├── extract_features.R ├── modeling.R ├── optimization.R ├── run_classifier.R ├── sysdata.rda ├── tokenization.R └── utils.R ├── RBERT.Rproj ├── README.Rmd ├── README.md ├── appveyor.yml ├── codecov.yml ├── data-raw └── sysdata.R ├── man ├── AdamWeightDecayOptimizer.Rd ├── BasicTokenizer.Rd ├── BertConfig.Rd ├── BertModel.Rd ├── FullTokenizer.Rd ├── InputExample.Rd ├── InputExample_EF.Rd ├── InputFeatures.Rd ├── WordpieceTokenizer.Rd ├── apply_to_chars.Rd ├── assert_rank.Rd ├── attention_layer.Rd ├── bert_config_from_json_file.Rd ├── check_vocab.Rd ├── clean_text.Rd ├── convert_by_vocab.Rd ├── convert_examples_to_features.Rd ├── convert_single_example.Rd ├── convert_to_unicode.Rd ├── create_attention_mask_from_input_mask.Rd ├── create_initializer.Rd ├── create_model.Rd ├── create_optimizer.Rd ├── dot-InputFeatures_EF.Rd ├── dot-choose_BERT_dir.Rd ├── dot-convert_examples_to_features_EF.Rd ├── dot-convert_single_example_EF.Rd ├── dot-download_BERT_checkpoint.Rd ├── dot-get_actual_index.Rd ├── dot-get_model_archive_path.Rd ├── dot-get_model_archive_type.Rd ├── dot-get_model_subdir.Rd ├── dot-get_model_url.Rd ├── dot-has_checkpoint.Rd ├── dot-infer_archive_type.Rd ├── dot-infer_checkpoint_archive_path.Rd ├── dot-infer_ckpt_dir.Rd ├── dot-infer_model_paths.Rd ├── dot-maybe_download_checkpoint.Rd ├── dot-model_fn_builder_EF.Rd ├── dot-process_BERT_checkpoint.Rd ├── download_BERT_checkpoint.Rd ├── dropout.Rd ├── embedding_lookup.Rd ├── embedding_postprocessor.Rd ├── extract_features.Rd ├── figures │ ├── rbert_hex.png │ └── rbert_hex.svg ├── file_based_convert_examples_to_features.Rd ├── file_based_input_fn_builder.Rd ├── find_files.Rd ├── gelu.Rd ├── get_activation.Rd ├── get_assignment_map_from_checkpoint.Rd ├── get_shape_list.Rd ├── input_fn_builder.Rd ├── input_fn_builder_EF.Rd ├── is_chinese_char.Rd ├── is_control.Rd ├── is_punctuation.Rd ├── is_whitespace.Rd ├── layer_norm.Rd ├── layer_norm_and_dropout.Rd ├── load_vocab.Rd ├── make_examples_simple.Rd ├── model_fn_builder.Rd ├── reshape_from_matrix.Rd ├── reshape_to_matrix.Rd ├── set_BERT_dir.Rd ├── split_on_punc.Rd ├── strip_accents.Rd ├── tokenize.Rd ├── tokenize_chinese_chars.Rd ├── tokenize_text.Rd ├── tokenize_word.Rd ├── transformer_model.Rd ├── transpose_for_scores.Rd ├── truncate_seq_pair.Rd └── whitespace_tokenize.Rd ├── tests ├── testthat.R └── testthat │ ├── attention_probs.rds │ ├── bert_config.json │ ├── sample_amap.rds │ ├── sample_feat_in.rds │ ├── sample_feats.rds │ ├── setup.R │ ├── teardown.R │ ├── test-download_checkpoint.R │ ├── test-extract_features.R │ ├── test-modeling.R │ ├── test-optimization.R │ ├── test-run_classifier.R │ ├── test-tokenization.R │ ├── test_checkpoints │ ├── .gitignore │ └── testing_checkpoint.tar.gz │ ├── tokenizer.rds │ ├── vocab.txt │ ├── vocab0.txt │ └── vocab_small.txt └── vignettes ├── .gitignore ├── BERT_basics.Rmd └── RBERT_intro.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^CONTRIBUTING\.md$ 4 | ^LICENSE\.md$ 5 | ^Contributor_Code_of_Conduct\.md$ 6 | ^README\.Rmd$ 7 | ^\.travis\.yml$ 8 | ^appveyor\.yml$ 9 | ^codecov\.yml$ 10 | ^data-raw$ 11 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | vocab_small.txt 6 | vocab.txt 7 | inst/doc 8 | .httr-oauth 9 | README.html 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | warnings_are_errors: false 5 | 6 | env: 7 | global: 8 | - TENSORFLOW_TEST_EXAMPLES="1" 9 | 10 | cache: 11 | packages: true 12 | directories: 13 | - $HOME/.cache/pip 14 | 15 | addons: 16 | apt: 17 | packages: 18 | python3-dev 19 | python3-pip 20 | python3-virtualenv 21 | python3-venv 22 | 23 | before_script: 24 | - python3 -m pip install --upgrade --ignore-installed --user travis virtualenv 25 | - R CMD INSTALL . 26 | - R -e 'tensorflow::install_tensorflow(version = "1.11.0", extra_packages="IPython")' 27 | - R -e 'tensorflow::tf_config()' 28 | 29 | after_success: 30 | - Rscript -e 'covr::codecov()' 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | Want to contribute? Great! First, read this page (including the small print at the end). 4 | 5 | ### Before you contribute 6 | Before we can use your code, you must sign the Macmillan Learning Individual Contributor License Agreement (CLA), which you will be asked to do automatically when you submit a pull request. The CLA is necessary mainly because you own the copyright to your changes, even after your contribution becomes part of our codebase, so we need your permission to use and distribute your code. We also need to be sure of various other things—for instance that you'll tell us if you know that your code infringes on other people's patents. You don't have to sign the CLA until after you've submitted your code for review and a member has approved it, but you must do it before we can put your code into our codebase. 7 | Before you start working on a larger contribution, you should get in touch with us first through the issue tracker with your idea so that we can help out and possibly guide you. Coordinating up front makes it much easier to avoid frustration later on. 8 | 9 | ### Making changes 10 | 11 | We use the github [fork and pull review process](https://help.github.com/articles/using-pull-requests) to review all contributions. First, fork the repository by following the [github instructions](https://help.github.com/articles/fork-a-repo). Then check out your personal fork: 12 | 13 | $ git clone https://github.com//RBERT.git 14 | 15 | Add an upstream remote so you can easily keep up to date with the main repository: 16 | 17 | $ git remote add upstream https://github.com/jonathanbratt/RBERT.git 18 | 19 | To update your local repo from the main: 20 | 21 | $ git pull upstream master 22 | 23 | When you're done making changes, make sure tests pass, and then commit your changes to your personal fork. Then use the GitHub Web UI to create and send the pull request. We'll review and merge the change. 24 | 25 | 26 | ### Code review 27 | 28 | All submissions, including submissions by project members, require review. To keep the code base maintainable and readable, all code is developed using a similar coding style. 
We typically follow the [tidyverse style guide](https://style.tidyverse.org/), with minor changes. 29 | 30 | 31 | The code should be easy to maintain and understand. It is important that you be able to come back, months later, to code that you've written and still quickly understand what it is supposed to be doing. Understandable code also makes it easier for other people to contribute. Quick-and-dirty solutions or "clever" coding tricks might work in the short term, but should be avoided in the interest of long term code quality. 32 | 33 | With the code review process, we ensure that at least two sets of eyes looked over the code in hopes of finding potential bugs or errors (before they become bugs and errors). This also improves the overall code quality and makes sure that every developer knows to (largely) expect the same coding style. 34 | 35 | 36 | [Unit tests](https://testthat.r-lib.org/) are an important part of the code. We aim for 100% test coverage, while recognizing that some functionality may be hard to cover in a unit test. 37 | 38 | 39 | 40 | ### The small print 41 | 42 | Contributions made by corporations will be covered by a 43 | different agreement than the one above. Contact us if this applies to you. 44 | -------------------------------------------------------------------------------- /Contributor_Code_of_Conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 
45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at jon.harmon@macmillan.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: RBERT 2 | Type: Package 3 | Title: R Implementation of BERT 4 | Version: 0.1.11 5 | Authors@R: c( 6 | person("Jonathan", "Bratt", email = "jonathan.bratt@macmillan.com", 7 | role = c("aut", "cre") 8 | ), 9 | person("Jon", "Harmon", email = "jon.harmon@macmillan.com", 10 | role = c("aut") 11 | ), 12 | person(family = "Google Inc.", role = c("ctb", "cph"), 13 | comment = "Original Python code; Examples and Tutorials") 14 | ) 15 | Description: Use pretrained models from Google Research's BERT in R. 16 | Encoding: UTF-8 17 | LazyData: true 18 | URL: https://github.com/jonathanbratt/RBERT 19 | BugReports: https://github.com/jonathanbratt/RBERT/issues 20 | Depends: R (>= 3.5.1) 21 | License: file LICENSE 22 | RoxygenNote: 7.0.2 23 | Imports: 24 | dplyr (>= 0.8.3), 25 | jsonlite (>= 1.6), 26 | purrr (>= 0.3.0), 27 | rappdirs (>= 0.3.1), 28 | reticulate (>= 1.12), 29 | stringi (>= 1.2.4), 30 | stringr (>= 1.4.0), 31 | tensorflow (>= 1.10), 32 | tibble (>= 2.1.3), 33 | tidyr (>= 1.0.0), 34 | utils 35 | Suggests: 36 | testthat (>= 2.1.0), 37 | mockery, 38 | knitr, 39 | rmarkdown, 40 | covr 41 | VignetteBuilder: knitr 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | S3method(tokenize,BasicTokenizer)
4 | S3method(tokenize,FullTokenizer)
5 | S3method(tokenize,WordpieceTokenizer)
6 | export(AdamWeightDecayOptimizer)
7 | export(BasicTokenizer)
8 | export(BertConfig)
9 | export(BertModel)
10 | export(FullTokenizer)
11 | export(InputExample)
12 | export(InputExample_EF)
13 | export(InputFeatures)
14 | export(WordpieceTokenizer)
15 | export(assert_rank)
16 | export(attention_layer)
17 | export(bert_config_from_json_file)
18 | export(check_vocab)
19 | export(convert_by_vocab)
20 | export(convert_examples_to_features)
21 | export(convert_ids_to_tokens)
22 | export(convert_single_example)
23 | export(convert_to_unicode)
24 | export(convert_tokens_to_ids)
25 | export(create_attention_mask_from_input_mask)
26 | export(create_initializer)
27 | export(create_model)
28 | export(create_optimizer)
29 | export(download_BERT_checkpoint)
30 | export(dropout)
31 | export(embedding_lookup)
32 | export(embedding_postprocessor)
33 | export(extract_features)
34 | export(file_based_convert_examples_to_features)
35 | export(file_based_input_fn_builder)
36 | export(find_ckpt)
37 | export(find_config)
38 | export(find_vocab)
39 | export(gelu)
40 | export(get_activation)
41 | export(get_assignment_map_from_checkpoint)
42 | export(get_shape_list)
43 | export(input_fn_builder)
44 | export(layer_norm)
45 | export(layer_norm_and_dropout)
46 | export(load_vocab)
47 | export(make_examples_simple)
48 | export(model_fn_builder)
49 | export(reshape_from_matrix)
50 | export(reshape_to_matrix)
51 | export(set_BERT_dir)
52 | export(tokenize)
53 | export(tokenize_text)
54 | export(tokenize_word)
55 | export(transformer_model)
56 | export(truncate_seq_pair)
57 | export(whitespace_tokenize)
58 | 
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # RBERT 0.1.11
2 | 
3 | * Added parameter to shush verbose `extract_features`.
4 | * Removed vestigial `use_one_hot_embeddings` parameter from everywhere.
5 | * `extract_features` can now take plain character vectors as input.
6 | * `extract_features` can now take a single checkpoint directory or model name,
7 | rather than three separate file paths.
8 | 
9 | # RBERT 0.1.7
10 | 
11 | * Updated `extract_features` to return tidy tibbles (@jonthegeek, #29).
12 | 
13 | # RBERT 0.1.6
14 | 
15 | * Updated `download_BERT_checkpoint` to simplify usage (@jonthegeek, #25).
16 | 
17 | # RBERT 0.1.0
18 | 
19 | * Added a `NEWS.md` file to track changes to the package.
20 | * Initial open source release.
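
To illustrate the simplified interface described in the 0.1.11 entries above, a call might now look roughly like the sketch below. This is a minimal sketch, not a verified signature: the `examples` argument name is an assumption for illustration, while `model` and `ckpt_dir` are the parameters documented for the checkpoint-finding helpers in `R/utils.R`.

``` r
# Plain character vector in; a tidy tibble of contextual features out.
# The named model is downloaded first if it has not already been cached.
feats <- RBERT::extract_features(
  examples = c("The bank is on the river bank."),
  model = "bert_base_uncased"  # or: ckpt_dir = "/path/to/a/local/checkpoint"
)
```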
21 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/R/sysdata.rda -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # find checkpoint files --------------------------------------------------- 16 | 17 | 18 | #' Find Checkpoint Files 19 | #' 20 | #' Given the path to a checkpoint directory, return the paths to certain files 21 | #' in that directory. 22 | #' 23 | #' @param ckpt_dir Character; the path to the checkpoint directory. If this 24 | #' argument is NULL, the associated functions also return NULL. 25 | #' @name find_files 26 | NULL 27 | 28 | #' @describeIn find_files Find the vocabulary file ('vocab.txt'). 29 | #' @export 30 | find_vocab <- function(ckpt_dir) { 31 | # If this gets called for a NULL ckpt_dir, return NULL. 32 | if (is.null(ckpt_dir)) { 33 | return(NULL) 34 | } 35 | 36 | vocab_file <- file.path(ckpt_dir, "vocab.txt") 37 | if (file.exists(vocab_file)) { 38 | return(vocab_file) 39 | } else { 40 | stop("No file named 'vocab.txt' found in ", ckpt_dir) # nocov 41 | } 42 | } 43 | 44 | 45 | #' @describeIn find_files Find the config file ('bert_config.json'). 46 | #' @export 47 | find_config <- function(ckpt_dir) { 48 | # If this gets called for a NULL ckpt_dir, return NULL. 49 | if (is.null(ckpt_dir)) { 50 | return(NULL) 51 | } 52 | 53 | config_file <- file.path(ckpt_dir, "bert_config.json") 54 | if (file.exists(config_file)) { 55 | return(config_file) 56 | } else { 57 | stop("No file named 'bert_config.json' found in ", ckpt_dir) # nocov 58 | } 59 | } 60 | 61 | #' @describeIn find_files Find the checkpoint file stub (files begin with 62 | #' 'bert_model.ckpt'). 63 | #' @export 64 | find_ckpt <- function(ckpt_dir) { 65 | # If this gets called for a NULL ckpt_dir, return NULL. 66 | if (is.null(ckpt_dir)) { 67 | return(NULL) 68 | } 69 | 70 | # The path we want to return here isn't an actual file, but a name stub for 71 | # files with suffixes '.index', '.meta', etc. 72 | ckpt_filestub <- file.path(ckpt_dir, "bert_model.ckpt") 73 | ckpt_file1 <- file.path(ckpt_dir, "bert_model.ckpt.index") 74 | ckpt_file2 <- file.path(ckpt_dir, "bert_model.ckpt.meta") 75 | if (file.exists(ckpt_file1) & file.exists(ckpt_file2)) { 76 | return(ckpt_filestub) 77 | } else { 78 | stop("Checkpoint file(s) missing from ", ckpt_dir) # nocov 79 | } 80 | } 81 | 82 | #' Find Paths to Checkpoint Files 83 | #' 84 | #' In some functions, the user can specify a model, a ckpt_dir, and/or specific 85 | #' paths to checkpoint files. This function sorts all of that out. 
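#' Explicitly supplied file paths are used as given; any that are missing are
#' inferred from ckpt_dir; if paths are still missing, the named model is
#' downloaded (if necessary) and its checkpoint directory is used instead.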
86 | #' 87 | #' @inheritParams extract_features 88 | #' @return A list with components vocab_file, bert_config_file, and 89 | #' init_checkpoint. 90 | #' @keywords internal 91 | .infer_model_paths <- function(model = c( 92 | "bert_base_uncased", 93 | "bert_base_cased", 94 | "bert_large_uncased", 95 | "bert_large_cased", 96 | "bert_large_uncased_wwm", 97 | "bert_large_cased_wwm", 98 | "bert_base_multilingual_cased", 99 | "bert_base_chinese", 100 | "scibert_scivocab_uncased", 101 | "scibert_scivocab_cased", 102 | "scibert_basevocab_uncased", 103 | "scibert_basevocab_cased" 104 | ), 105 | ckpt_dir = NULL, 106 | vocab_file = find_vocab(ckpt_dir), 107 | bert_config_file = find_config(ckpt_dir), 108 | init_checkpoint = find_ckpt(ckpt_dir)) { 109 | # Deal with the fact that model will never be *missing* when this function is 110 | # called, but we don't want the calling functions to have to deal with parsing 111 | # the argument. 112 | if (length(model) > 1) { 113 | model <- NULL 114 | } else { 115 | model <- match.arg(model) 116 | } 117 | 118 | # If any of the necessary files aren't specified, try to find them. This would 119 | # most likely only happen if they specified one file but not all (and left 120 | # ckpt_dir as NULL), but run this to be sure. 121 | vocab_file <- vocab_file %||% find_vocab(ckpt_dir) 122 | bert_config_file <- bert_config_file %||% find_config(ckpt_dir) 123 | init_checkpoint <- init_checkpoint %||% find_ckpt(ckpt_dir) 124 | 125 | # At this point either we have the paths, or we need to infer from the model. 126 | if ((is.null(vocab_file) | 127 | is.null(bert_config_file) | 128 | is.null(init_checkpoint))) { 129 | if (is.null(model)) { 130 | stop( 131 | "You must specify a model, a ckpt_dir, or the locations of ", 132 | "vocab_file, bert_config_file, and init_checkpoint." 133 | ) 134 | } else { 135 | dir <- .choose_BERT_dir(NULL) 136 | ckpt_dir <- .get_model_subdir(model, dir) 137 | .maybe_download_checkpoint( 138 | model = model, 139 | dir = dir, 140 | ckpt_dir = ckpt_dir 141 | ) 142 | 143 | # If we made it here, they have the model, so set the file locations. 
144 | vocab_file <- find_vocab(ckpt_dir) 145 | bert_config_file <- find_config(ckpt_dir) 146 | init_checkpoint <- find_ckpt(ckpt_dir) 147 | } 148 | } 149 | return( 150 | list( 151 | vocab_file = vocab_file, 152 | bert_config_file = bert_config_file, 153 | init_checkpoint = init_checkpoint 154 | ) 155 | ) 156 | } 157 | -------------------------------------------------------------------------------- /RBERT.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | # RBERT 16 | 17 | 18 | [![Lifecycle: superseded](https://img.shields.io/badge/lifecycle-superseded-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html#superseded) 19 | [![Travis build status](https://travis-ci.org/jonathanbratt/RBERT.svg?branch=master)](https://travis-ci.org/jonathanbratt/RBERT) 20 | [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/jonathanbratt/RBERT?branch=master&svg=true)](https://ci.appveyor.com/project/jonathanbratt/RBERT) 21 | [![Codecov test coverage](https://codecov.io/gh/jonathanbratt/RBERT/branch/master/graph/badge.svg)](https://codecov.io/gh/jonathanbratt/RBERT?branch=master) 22 | 23 | 24 | We are re-implementing BERT for R in [{torchtransformers}](https://github.com/macmillancontentscience/torchtransformers). We find {torch} much easier to work with in R than {tensorflow}, and strongly recommend starting there! 25 | 26 | --- 27 | 28 | RBERT is an R implementation of the Python package [BERT](https://github.com/google-research/bert) developed at Google for Natural Language Processing. 29 | 30 | ## Installation 31 | 32 | You can install RBERT from [GitHub](https://github.com/) with: 33 | 34 | ```{r installation, eval = FALSE} 35 | # install.packages("devtools") 36 | devtools::install_github( 37 | "jonathanbratt/RBERT", 38 | build_vignettes = TRUE 39 | ) 40 | ``` 41 | 42 | ### TensorFlow Installation 43 | 44 | RBERT requires TensorFlow. Currently the version must be <= 1.13.1. You can install it using the tensorflow package (installed as a dependency of this package; see note below about Windows). 45 | 46 | ```{r tensorflow, eval = FALSE} 47 | tensorflow::install_tensorflow(version = "1.13.1") 48 | ``` 49 | 50 | ### Windows 51 | 52 | The current CRAN version of reticulate (1.13) causes some issues with the tensorflow installation. Rebooting your machine after installing Anaconda seems to fix this issue, or upgrade to the development version of reticulate. 53 | 54 | ```{r install dev reticulate, eval = FALSE} 55 | devtools::install_github("rstudio/reticulate") 56 | ``` 57 | 58 | ## Basic usage 59 | 60 | RBERT is a work in progress. 
While fine-tuning a BERT model using RBERT may be possible, it is not currently recommended. 61 | 62 | RBERT is best suited for exploring pre-trained BERT models, and obtaining contextual representations of input text for use as features in downstream tasks. 63 | 64 | * See the "Introduction to RBERT" vignette included with the package for more specific examples. 65 | * For a quick explanation of what BERT is, see the "BERT Basics" vignette. 66 | * The package [RBERTviz](https://github.com/jonathanbratt/RBERTviz) provides tools for making fun and easy visualizations of BERT data. 67 | 68 | ## Running Tests 69 | 70 | The first time you run the test suite, the 388.8MB bert_base_uncased.zip file will download in your `tests/testthat/test_checkpoints` directory. Subsequent test runs will use that download. This was our best compromise to allow for relatively rapid testing without bloating the repository. 71 | 72 | ## Disclaimer 73 | 74 | This is not an officially supported Macmillan Learning product. 75 | 76 | ## Contact information 77 | 78 | Questions or comments should be directed to Jonathan Bratt (jonathan.bratt@macmillan.com) and Jon Harmon (jon.harmon@macmillan.com). 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # RBERT 5 | 6 | 7 | 8 | [![Lifecycle: 9 | superseded](https://img.shields.io/badge/lifecycle-superseded-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html#superseded) 10 | [![Travis build 11 | status](https://travis-ci.org/jonathanbratt/RBERT.svg?branch=master)](https://travis-ci.org/jonathanbratt/RBERT) 12 | [![AppVeyor build 13 | status](https://ci.appveyor.com/api/projects/status/github/jonathanbratt/RBERT?branch=master&svg=true)](https://ci.appveyor.com/project/jonathanbratt/RBERT) 14 | [![Codecov test 15 | coverage](https://codecov.io/gh/jonathanbratt/RBERT/branch/master/graph/badge.svg)](https://codecov.io/gh/jonathanbratt/RBERT?branch=master) 16 | 17 | 18 | We are re-implementing BERT for R in 19 | [{torchtransformers}](https://github.com/macmillancontentscience/torchtransformers). 20 | We find {torch} much easier to work with in R than {tensorflow}, and 21 | strongly recommend starting there! 22 | 23 | ------------------------------------------------------------------------ 24 | 25 | RBERT is an R implementation of the Python package 26 | [BERT](https://github.com/google-research/bert) developed at Google for 27 | Natural Language Processing. 28 | 29 | ## Installation 30 | 31 | You can install RBERT from [GitHub](https://github.com/) with: 32 | 33 | ``` r 34 | # install.packages("devtools") 35 | devtools::install_github( 36 | "jonathanbratt/RBERT", 37 | build_vignettes = TRUE 38 | ) 39 | ``` 40 | 41 | ### TensorFlow Installation 42 | 43 | RBERT requires TensorFlow. Currently the version must be \<= 1.13.1. You 44 | can install it using the tensorflow package (installed as a dependency 45 | of this package; see note below about Windows). 46 | 47 | ``` r 48 | tensorflow::install_tensorflow(version = "1.13.1") 49 | ``` 50 | 51 | ### Windows 52 | 53 | The current CRAN version of reticulate (1.13) causes some issues with 54 | the tensorflow installation. Rebooting your machine after installing 55 | Anaconda seems to fix this issue, or upgrade to the development version 56 | of reticulate. 
57 | 58 | ``` r 59 | devtools::install_github("rstudio/reticulate") 60 | ``` 61 | 62 | ## Basic usage 63 | 64 | RBERT is a work in progress. While fine-tuning a BERT model using RBERT 65 | may be possible, it is not currently recommended. 66 | 67 | RBERT is best suited for exploring pre-trained BERT models, and 68 | obtaining contextual representations of input text for use as features 69 | in downstream tasks. 70 | 71 | - See the “Introduction to RBERT” vignette included with the package 72 | for more specific examples. 73 | - For a quick explanation of what BERT is, see the “BERT Basics” 74 | vignette. 75 | - The package [RBERTviz](https://github.com/jonathanbratt/RBERTviz) 76 | provides tools for making fun and easy visualizations of BERT data. 77 | 78 | ## Running Tests 79 | 80 | The first time you run the test suite, the 388.8MB bert_base_uncased.zip 81 | file will download in your `tests/testthat/test_checkpoints` directory. 82 | Subsequent test runs will use that download. This was our best 83 | compromise to allow for relatively rapid testing without bloating the 84 | repository. 85 | 86 | ## Disclaimer 87 | 88 | This is not an officially supported Macmillan Learning product. 89 | 90 | ## Contact information 91 | 92 | Questions or comments should be directed to Jonathan Bratt 93 | () and Jon Harmon 94 | (). 95 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | environment: 4 | global: 5 | USE_RTOOLS: true 6 | R_REMOTES_STANDALONE: true 7 | matrix: 8 | - TF_VERSION: 1.12.0 9 | - TF_VERSION: 1.13.1 10 | # - TF_VERSION: 1.14.0 11 | # - TF_VERSION: 2.0.0-rc0 12 | 13 | # Download script file from GitHub 14 | init: 15 | ps: | 16 | $ErrorActionPreference = "Stop" 17 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 18 | Import-Module '..\appveyor-tool.ps1' 19 | 20 | install: 21 | ps: Bootstrap 22 | 23 | cache: 24 | - C:\RLibrary 25 | 26 | # Adapt as necessary starting from here 27 | 28 | build_script: 29 | - travis-tool.sh install_deps 30 | - R CMD INSTALL . 
31 | - R -e "install.packages('devtools', repos = 'http://cran.rstudio.com'); devtools::install_github('rstudio/reticulate')" 32 | - R -e "tensorflow::install_tensorflow(method = 'conda', version = Sys.getenv('TF_VERSION'), extra_packages = 'IPython', envname = 'r-reticulate')" 33 | 34 | test_script: 35 | - travis-tool.sh run_tests 36 | 37 | on_failure: 38 | - 7z a failure.zip *.Rcheck\* 39 | - appveyor PushArtifact failure.zip 40 | 41 | artifacts: 42 | - path: '*.Rcheck\**\*.log' 43 | name: Logs 44 | 45 | - path: '*.Rcheck\**\*.out' 46 | name: Logs 47 | 48 | - path: '*.Rcheck\**\*.fail' 49 | name: Logs 50 | 51 | - path: '*.Rcheck\**\*.Rout' 52 | name: Logs 53 | 54 | - path: '\*_*.tar.gz' 55 | name: Bits 56 | 57 | - path: '\*_*.zip' 58 | name: Bits 59 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /data-raw/sysdata.R: -------------------------------------------------------------------------------- 1 | library(magrittr) 2 | google_base_url <- "https://storage.googleapis.com/bert_models/" 3 | scibert_base_url <- "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/" 4 | 5 | checkpoint_url_map <- c( 6 | "bert_base_uncased" = paste0( 7 | google_base_url, 8 | "2018_10_18/uncased_L-12_H-768_A-12.zip" 9 | ), 10 | "bert_base_cased" = paste0( 11 | google_base_url, 12 | "2018_10_18/cased_L-12_H-768_A-12.zip" 13 | ), 14 | "bert_large_uncased" = paste0( 15 | google_base_url, 16 | "2018_10_18/uncased_L-24_H-1024_A-16.zip" 17 | ), 18 | "bert_large_cased" = paste0( 19 | google_base_url, 20 | "2018_10_18/cased_L-24_H-1024_A-16.zip" 21 | ), 22 | "bert_large_uncased_wwm" = paste0( 23 | google_base_url, 24 | "2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip" 25 | ), 26 | "bert_large_cased_wwm" = paste0( 27 | google_base_url, 28 | "2019_05_30/wwm_cased_L-24_H-1024_A-16.zip" 29 | ), 30 | "bert_base_multilingual_cased" = paste0( 31 | google_base_url, 32 | "2018_11_23/multi_cased_L-12_H-768_A-12.zip" 33 | ), 34 | "bert_base_chinese" = paste0( 35 | google_base_url, 36 | "2018_11_03/chinese_L-12_H-768_A-12.zip" 37 | ), 38 | "scibert_scivocab_uncased" = paste0( 39 | scibert_base_url, 40 | "scibert_scivocab_uncased.tar.gz" 41 | ), 42 | "scibert_scivocab_cased" = paste0( 43 | scibert_base_url, 44 | "scibert_scivocab_cased.tar.gz" 45 | ), 46 | "scibert_basevocab_uncased" = paste0( 47 | scibert_base_url, 48 | "scibert_basevocab_uncased.tar.gz" 49 | ), 50 | "scibert_basevocab_cased" = paste0( 51 | scibert_base_url, 52 | "scibert_basevocab_cased.tar.gz" 53 | ) 54 | ) 55 | 56 | # I want to convert this to a tibble with more info, but I don't want to 57 | # reformat all that, so I'm using enframe. 
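# (enframe() turns the named character vector into a two-column tibble, one row
# per model, with the names in the 'model' column and the URLs in 'url'; the
# mutate() below then tags each row with its archive type: zip for the Google
# models, tar-gzip for the SciBERT ones.)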
58 | checkpoint_url_map <- tibble::enframe( 59 | checkpoint_url_map, 60 | name = "model", value = "url" 61 | ) %>% 62 | dplyr::mutate( 63 | archive_type = c( 64 | rep("zip", 8), 65 | rep("tar-gzip", 4) 66 | ) 67 | ) 68 | 69 | usethis::use_data( 70 | checkpoint_url_map, 71 | internal = TRUE, 72 | overwrite = TRUE 73 | ) 74 | rm( 75 | google_base_url, 76 | scibert_base_url, 77 | checkpoint_url_map 78 | ) 79 | -------------------------------------------------------------------------------- /man/AdamWeightDecayOptimizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/optimization.R 3 | \name{AdamWeightDecayOptimizer} 4 | \alias{AdamWeightDecayOptimizer} 5 | \title{Constructor for objects of class AdamWeightDecayOptimizer} 6 | \usage{ 7 | AdamWeightDecayOptimizer( 8 | learning_rate, 9 | weight_decay_rate = 0, 10 | beta_1 = 0.9, 11 | beta_2 = 0.999, 12 | epsilon = 1e-06, 13 | exclude_from_weight_decay = NULL, 14 | name = "AdamWeightDecayOptimizer" 15 | ) 16 | } 17 | \arguments{ 18 | \item{learning_rate}{Numeric Tensor (single element?); learning rate.} 19 | 20 | \item{weight_decay_rate}{Numeric; weight decay rate.} 21 | 22 | \item{beta_1}{Numeric; parameter for Adam.} 23 | 24 | \item{beta_2}{Numeric; parameter for Adam.} 25 | 26 | \item{epsilon}{Numeric; a tiny number to put a cap on update size by avoiding 27 | dividing by even smaller numbers.} 28 | 29 | \item{exclude_from_weight_decay}{Character; list of parameter names to 30 | exclude from weight decay.} 31 | 32 | \item{name}{Character; the name of the constructed object.} 33 | } 34 | \value{ 35 | An object of class "AdamWeightDecayOptimizer", which is a (hacky) 36 | modification of the tf.train.Optimizer class. 37 | } 38 | \description{ 39 | A basic Adam optimizer that includes "correct" L2 weight decay. 40 | } 41 | \details{ 42 | Inherits from class tf.train.Optimizer. 43 | \url{https://devdocs.io/tensorflow~python/tf/train/optimizer} 44 | } 45 | \examples{ 46 | \dontrun{ 47 | with(tensorflow::tf$variable_scope("examples", 48 | reuse = tensorflow::tf$AUTO_REUSE 49 | ), { 50 | optimizer <- AdamWeightDecayOptimizer(learning_rate = 0.01) 51 | }) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /man/BasicTokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{BasicTokenizer} 4 | \alias{BasicTokenizer} 5 | \title{Construct objects of BasicTokenizer class.} 6 | \usage{ 7 | BasicTokenizer(do_lower_case = TRUE) 8 | } 9 | \arguments{ 10 | \item{do_lower_case}{Logical; the value to give to the "do_lower_case" 11 | argument in the BasicTokenizer object.} 12 | } 13 | \value{ 14 | an object of class BasicTokenizer 15 | } 16 | \description{ 17 | (I'm not sure that this object-based approach is best for R implementation, 18 | but for now just trying to reproduce python functionality.) 
19 | } 20 | \details{ 21 | Has methods: `tokenize.BasicTokenizer()` `run_strip_accents.BasicTokenizer()` 22 | (internal use) `run_split_on_punc.BasicTokenizer()` (internal use) 23 | `tokenize_chinese_chars.BasicTokenizer()` (internal use) 24 | `is_chinese_char.BasicTokenizer()` (internal use) 25 | `clean_text.BasicTokenizer()` (internal use) 26 | } 27 | \examples{ 28 | \dontrun{ 29 | b_tokenizer <- BasicTokenizer(TRUE) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/BertConfig.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{BertConfig} 4 | \alias{BertConfig} 5 | \title{Construct objects of BertConfig class} 6 | \usage{ 7 | BertConfig( 8 | vocab_size, 9 | hidden_size = 768L, 10 | num_hidden_layers = 12L, 11 | num_attention_heads = 12L, 12 | intermediate_size = 3072L, 13 | hidden_act = "gelu", 14 | hidden_dropout_prob = 0.1, 15 | attention_probs_dropout_prob = 0.1, 16 | max_position_embeddings = 512L, 17 | type_vocab_size = 16L, 18 | initializer_range = 0.02 19 | ) 20 | } 21 | \arguments{ 22 | \item{vocab_size}{Integer; vocabulary size of \code{inputs_ids} in 23 | \code{BertModel}.} 24 | 25 | \item{hidden_size}{Integer; size of the encoder layers and the pooler layer.} 26 | 27 | \item{num_hidden_layers}{Integer; number of hidden layers in the Transformer 28 | encoder.} 29 | 30 | \item{num_attention_heads}{Integer; number of attention heads for each 31 | attention layer in the Transformer encoder.} 32 | 33 | \item{intermediate_size}{Integer; the size of the "intermediate" (i.e., 34 | feed-forward) layer in the Transformer encoder.} 35 | 36 | \item{hidden_act}{The non-linear activation function (function or string) in 37 | the encoder and pooler.} 38 | 39 | \item{hidden_dropout_prob}{Numeric; the dropout probability for all fully 40 | connected layers in the embeddings, encoder, and pooler.} 41 | 42 | \item{attention_probs_dropout_prob}{Numeric; the dropout ratio for the 43 | attention probabilities.} 44 | 45 | \item{max_position_embeddings}{Integer; the maximum sequence length that this 46 | model might ever be used with. Typically set this to something large just 47 | in case (e.g., 512 or 1024 or 2048).} 48 | 49 | \item{type_vocab_size}{Integer; the vocabulary size of the 50 | \code{token_type_ids} passed into \code{BertModel}.} 51 | 52 | \item{initializer_range}{Numeric; the stdev of the 53 | truncated_normal_initializer for initializing all weight matrices.} 54 | } 55 | \value{ 56 | An object of class BertConfig 57 | } 58 | \description{ 59 | Given a set of values as parameter inputs, construct a BertConfig object with 60 | those values. 61 | } 62 | \examples{ 63 | \dontrun{ 64 | BertConfig(vocab_size = 30522L) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /man/BertModel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{BertModel} 4 | \alias{BertModel} 5 | \title{Construct object of class BertModel} 6 | \usage{ 7 | BertModel( 8 | config, 9 | is_training, 10 | input_ids, 11 | input_mask = NULL, 12 | token_type_ids = NULL, 13 | scope = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{config}{\code{BertConfig} instance.} 18 | 19 | \item{is_training}{Logical; TRUE for training model, FALSE for eval model. 
20 | Controls whether dropout will be applied.} 21 | 22 | \item{input_ids}{Int32 Tensor of shape \code{[batch_size, seq_length]}.} 23 | 24 | \item{input_mask}{(optional) Int32 Tensor of shape \code{[batch_size, 25 | seq_length]}.} 26 | 27 | \item{token_type_ids}{(optional) Int32 Tensor of shape \code{[batch_size, 28 | seq_length]}.} 29 | 30 | \item{scope}{(optional) Character; name for variable scope. Defaults to 31 | "bert".} 32 | } 33 | \value{ 34 | An object of class BertModel. 35 | } 36 | \description{ 37 | An object of class BertModel has several elements: 38 | \describe{ 39 | \item{embedding_output}{float Tensor of shape \code{[batch_size, seq_length, 40 | hidden_size]} corresponding to the output of the embedding layer, after 41 | summing the word embeddings with the positional embeddings and the token type 42 | embeddings, then performing layer normalization. This is the input to the 43 | transformer.} 44 | \item{embedding_table}{The table for the token embeddings.} 45 | \item{all_encoder_layers}{A list of float Tensors of shape \code{[batch_size, 46 | seq_length, hidden_size]}, corresponding to all the hidden transformer 47 | layers.} 48 | \item{sequence_output}{float Tensor of shape \code{[batch_size, seq_length, 49 | hidden_size]} corresponding to the final hidden layer of the transformer 50 | encoder.} 51 | \item{pooled_output}{The dense layer on top of the hidden layer for the first 52 | token.} 53 | } 54 | } 55 | \examples{ 56 | \dontrun{ 57 | with(tensorflow::tf$variable_scope("examples", 58 | reuse = tensorflow::tf$AUTO_REUSE 59 | ), { 60 | input_ids <- tensorflow::tf$constant(list( 61 | list(31L, 51L, 99L), 62 | list(15L, 5L, 0L) 63 | )) 64 | 65 | input_mask <- tensorflow::tf$constant(list( 66 | list(1L, 1L, 1L), 67 | list(1L, 1L, 0L) 68 | )) 69 | token_type_ids <- tensorflow::tf$constant(list( 70 | list(0L, 0L, 1L), 71 | list(0L, 2L, 0L) 72 | )) 73 | config <- BertConfig( 74 | vocab_size = 32000L, 75 | hidden_size = 768L, 76 | num_hidden_layers = 8L, 77 | num_attention_heads = 12L, 78 | intermediate_size = 1024L 79 | ) 80 | model <- BertModel( 81 | config = config, 82 | is_training = TRUE, 83 | input_ids = input_ids, 84 | input_mask = input_mask, 85 | token_type_ids = token_type_ids 86 | ) 87 | }) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /man/FullTokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{FullTokenizer} 4 | \alias{FullTokenizer} 5 | \title{Construct objects of FullTokenizer class.} 6 | \usage{ 7 | FullTokenizer(vocab_file, do_lower_case = TRUE) 8 | } 9 | \arguments{ 10 | \item{vocab_file}{Path to text file containing list of vocabulary tokens.} 11 | 12 | \item{do_lower_case}{Logical: do we convert everything to lowercase?} 13 | } 14 | \value{ 15 | An object of class FullTokenizer. 16 | } 17 | \description{ 18 | Construct objects of FullTokenizer class. 
19 | }
20 | \examples{
21 | \dontrun{
22 | f_tokenizer <- FullTokenizer("vocab.txt", TRUE)
23 | }
24 | }
25 | 
--------------------------------------------------------------------------------
/man/InputExample.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_classifier.R
3 | \name{InputExample}
4 | \alias{InputExample}
5 | \title{Construct objects of class \code{InputExample}}
6 | \usage{
7 | InputExample(guid, text_a, text_b = NULL, label = NULL)
8 | }
9 | \arguments{
10 | \item{guid}{Unique id for the example (character or integer?).}
11 | 
12 | \item{text_a}{Character; the untokenized text of the first sequence. For
13 | single sequence tasks, only this sequence must be specified.}
14 | 
15 | \item{text_b}{(Optional) Character; the untokenized text of the second
16 | sequence. Must be specified only for sequence pair tasks.}
17 | 
18 | \item{label}{(Optional) Character; the label of the example. This should be
19 | specified for train and dev examples, but not for test examples.}
20 | }
21 | \value{
22 | An object of class \code{InputExample}.
23 | }
24 | \description{
25 | An input example is a single training/test example for simple sequence
26 | classification.
27 | }
28 | \examples{
29 | \dontrun{
30 | input_ex <- InputExample(guid = 0, text_a = "Some text to classify.")
31 | }
32 | }
33 | 
--------------------------------------------------------------------------------
/man/InputExample_EF.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_features.R
3 | \name{InputExample_EF}
4 | \alias{InputExample_EF}
5 | \title{Construct objects of class \code{InputExample_EF}}
6 | \usage{
7 | InputExample_EF(unique_id, text_a, text_b = NULL)
8 | }
9 | \arguments{
10 | \item{unique_id}{Integer or character; a unique id for this example.}
11 | 
12 | \item{text_a}{Character; the untokenized text of the first sequence.}
13 | 
14 | \item{text_b}{(Optional) Character; the untokenized text of the second
15 | sequence.}
16 | }
17 | \value{
18 | An object of class \code{InputExample_EF}.
19 | }
20 | \description{
21 | An InputExample_EF is a single test example for feature extraction. Note that
22 | this class is similar to the InputExample class used for simple sequence
23 | classification, but doesn't have a label property. The name of the id
24 | property is also annoyingly different; we should eventually standardize this
25 | better than the Python folks did. (RBERT issue #28.)
26 | }
27 | \examples{
28 | input_ex <- InputExample_EF(
29 | unique_id = 1,
30 | text_a = "I work at the bank."
31 | ) 32 | } 33 | -------------------------------------------------------------------------------- /man/InputFeatures.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{InputFeatures} 4 | \alias{InputFeatures} 5 | \title{Construct objects of class \code{InputFeatures}} 6 | \usage{ 7 | InputFeatures( 8 | input_ids, 9 | input_mask, 10 | segment_ids, 11 | label_id, 12 | is_real_example = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_ids}{Integer Tensor; the sequence of token ids in this example.} 17 | 18 | \item{input_mask}{Integer Tensor; sequence of 1s (for "real" tokens) and 0s 19 | (for padding tokens).} 20 | 21 | \item{segment_ids}{Integer Tensor; aka token_type_ids. Indicators for which 22 | sentence (or sequence) each token belongs to. Classical BERT supports only 23 | 0s and 1s (for first and second sentence, respectively).} 24 | 25 | \item{label_id}{Integer; represents training example classification labels.} 26 | 27 | \item{is_real_example}{Logical; later on this is used as a flag for whether 28 | to "count" this example for calculating accuracy and loss.} 29 | } 30 | \value{ 31 | An object of class \code{InputFeatures}. 32 | } 33 | \description{ 34 | An InputFeatures object is a single set of features of data. 35 | } 36 | \examples{ 37 | \dontrun{ 38 | features <- InputFeatures(input_ids, input_mask, segment_ids, label_id) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /man/WordpieceTokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{WordpieceTokenizer} 4 | \alias{WordpieceTokenizer} 5 | \title{Construct objects of WordpieceTokenizer class.} 6 | \usage{ 7 | WordpieceTokenizer(vocab, unk_token = "[UNK]", max_input_chars_per_word = 200) 8 | } 9 | \arguments{ 10 | \item{vocab}{Recognized vocabulary tokens, as a named integer vector. (Name 11 | is token, value is index.)} 12 | 13 | \item{unk_token}{Token to use for unknown words.} 14 | 15 | \item{max_input_chars_per_word}{Length of longest word we will recognize.} 16 | } 17 | \value{ 18 | An object of class WordpieceTokenizer. 19 | } 20 | \description{ 21 | (I'm not sure that this object-based approach is best for R implementation, 22 | but for now just trying to reproduce python functionality.) 23 | } 24 | \details{ 25 | Has method: tokenize.WordpieceTokenizer() 26 | } 27 | \examples{ 28 | \dontrun{ 29 | vocab <- load_vocab(vocab_file = "vocab.txt") 30 | wp_tokenizer <- WordpieceTokenizer(vocab) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/apply_to_chars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{apply_to_chars} 4 | \alias{apply_to_chars} 5 | \title{Apply a function to each character in a string.} 6 | \usage{ 7 | apply_to_chars(text, .f, ...) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar to process.} 11 | 12 | \item{.f}{The function to apply to each character.
Should return a character 13 | scalar, given a single-character input.} 14 | 15 | \item{...}{Other arguments to pass to .f.} 16 | } 17 | \value{ 18 | The character scalar obtained by applying the given function to 19 | each character of the input string, and concatenating the results. 20 | } 21 | \description{ 22 | Utility function for something done a lot in this package. 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/assert_rank.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{assert_rank} 4 | \alias{assert_rank} 5 | \title{Confirm the rank of a tensor} 6 | \usage{ 7 | assert_rank(tensor, expected_rank, name = NULL) 8 | } 9 | \arguments{ 10 | \item{tensor}{A tf.Tensor to check the rank of.} 11 | 12 | \item{expected_rank}{Integer vector or list of integers, expected rank.} 13 | 14 | \item{name}{Optional name of the tensor for the error message.} 15 | } 16 | \value{ 17 | TRUE if the Tensor is of the expected rank (error otherwise). 18 | } 19 | \description{ 20 | Throws an error if the tensor rank is not of the expected rank. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | with(tensorflow::tf$variable_scope("examples", 25 | reuse = tensorflow::tf$AUTO_REUSE 26 | ), { 27 | ids <- tensorflow::tf$get_variable("x", dtype = "int32", shape = 10L) 28 | assert_rank(ids, 1) 29 | assert_rank(ids, 1:2) 30 | assert_rank(ids, 2) 31 | }) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/attention_layer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{attention_layer} 4 | \alias{attention_layer} 5 | \title{Build multi-headed attention layer} 6 | \usage{ 7 | attention_layer( 8 | from_tensor, 9 | to_tensor, 10 | attention_mask = NULL, 11 | num_attention_heads = 1L, 12 | size_per_head = 512L, 13 | query_act = NULL, 14 | key_act = NULL, 15 | value_act = NULL, 16 | attention_probs_dropout_prob = 0, 17 | initializer_range = 0.02, 18 | do_return_2d_tensor = FALSE, 19 | batch_size = NULL, 20 | from_seq_length = NULL, 21 | to_seq_length = NULL 22 | ) 23 | } 24 | \arguments{ 25 | \item{from_tensor}{Float Tensor of shape \code{[batch_size, from_seq_length, 26 | from_width]}.} 27 | 28 | \item{to_tensor}{Float Tensor of shape \code{[batch_size, to_seq_length, 29 | to_width]}.} 30 | 31 | \item{attention_mask}{(optional) Integer Tensor of shape \code{[batch_size, 32 | from_seq_length, to_seq_length]}. The values should be 1 or 0. The 33 | attention scores will effectively be set to -infinity for any positions in 34 | the mask that are 0, and will be unchanged for positions that are 1.} 35 | 36 | \item{num_attention_heads}{Integer; number of attention heads.} 37 | 38 | \item{size_per_head}{Integer; size of each attention head.} 39 | 40 | \item{query_act}{(Optional) Activation function for the query transform.} 41 | 42 | \item{key_act}{(Optional) Activation function for the key transform.} 43 | 44 | \item{value_act}{(Optional) Activation function for the value transform.} 45 | 46 | \item{attention_probs_dropout_prob}{(Optional) Numeric; dropout probability 47 | of the attention probabilities.} 48 | 49 | \item{initializer_range}{Numeric; range of the weight initializer.} 50 | 51 | \item{do_return_2d_tensor}{Logical. 
If TRUE, the output will be of shape 52 | \code{[batch_size * from_seq_length, num_attention_heads * size_per_head]}. 53 | If FALSE, the output will be of shape \code{[batch_size, from_seq_length, 54 | num_attention_heads * size_per_head]}.} 55 | 56 | \item{batch_size}{(Optional) Integer; if the input is 2D, this might (sic) be 57 | the batch size of the 3D version of the \code{from_tensor} and 58 | \code{to_tensor}.} 59 | 60 | \item{from_seq_length}{(Optional) Integer; if the input is 2D, this might be 61 | the seq length of the 3D version of the \code{from_tensor}.} 62 | 63 | \item{to_seq_length}{(Optional) Integer; if the input is 2D, this might be 64 | the seq length of the 3D version of the \code{to_tensor}.} 65 | } 66 | \value{ 67 | float Tensor of shape \code{[batch_size, from_seq_length, 68 | num_attention_heads * size_per_head]}. If \code{do_return_2d_tensor} is 69 | TRUE, it will be flattened to shape \code{[batch_size * from_seq_length, 70 | num_attention_heads * size_per_head]}. 71 | } 72 | \description{ 73 | Performs multi-headed attention from \code{from_tensor} to \code{to_tensor}. 74 | This is an implementation of multi-headed attention based on "Attention Is 75 | All You Need". If \code{from_tensor} and \code{to_tensor} are the same, then 76 | this is self-attention. Each timestep in \code{from_tensor} attends to the 77 | corresponding sequence in \code{to_tensor}, and returns a fixed-width vector. 78 | This function first projects \code{from_tensor} into a "query" tensor and 79 | \code{to_tensor} into "key" and "value" tensors. These are (effectively) a 80 | list of tensors of length \code{num_attention_heads}, where each tensor is of 81 | shape \code{[batch_size, seq_length, size_per_head]}. Then, the query and key 82 | tensors are dot-producted and scaled. These are softmaxed to obtain attention 83 | probabilities. The value tensors are then interpolated by these 84 | probabilities, then concatenated back to a single tensor and returned. 85 | } 86 | \details{ 87 | In practice, the multi-headed attention is done with transposes and reshapes 88 | rather than actual separate tensors. 89 | } 90 | \examples{ 91 | \dontrun{ 92 | # Maybe add examples later. For now, this is only called from 93 | # within transformer_model(), so refer to that function. 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /man/bert_config_from_json_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{bert_config_from_json_file} 4 | \alias{bert_config_from_json_file} 5 | \title{Load BERT config object from json file} 6 | \usage{ 7 | bert_config_from_json_file(json_file) 8 | } 9 | \arguments{ 10 | \item{json_file}{Character; the path to a json config file.} 11 | } 12 | \value{ 13 | An object of class BertConfig 14 | } 15 | \description{ 16 | Given a path to a json config file, construct a BertConfig object with 17 | appropriate values.
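For orientation, the json file shipped with the Google checkpoints is a flat list of model hyperparameters. A rough sketch of reading such a file directly (this is not the package's implementation; it assumes the jsonlite package is available, and the comments show the usual BERT-Base values):

cfg <- jsonlite::fromJSON("uncased_L-12_H-768_A-12/bert_config.json")
cfg$hidden_size          # 768
cfg$num_hidden_layers    # 12
cfg$num_attention_heads  # 12
cfg$vocab_size           # 30522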
18 | } 19 | \examples{ 20 | \dontrun{ 21 | temp_dir <- tempdir() 22 | json_file <- file.path( 23 | temp_dir, 24 | "BERT_checkpoints", 25 | "uncased_L-12_H-768_A-12", 26 | "bert_config.json" 27 | ) 28 | bert_config_from_json_file(json_file) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/check_vocab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{check_vocab} 4 | \alias{check_vocab} 5 | \title{Check Vocabulary} 6 | \usage{ 7 | check_vocab(words, ckpt_dir = NULL, vocab_file = find_vocab(ckpt_dir)) 8 | } 9 | \arguments{ 10 | \item{words}{Character vector; words to check.} 11 | 12 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 13 | other checkpoint files required by this function (\code{vocab_file}, 14 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 15 | standard filenames within \code{ckpt_dir}.} 16 | 17 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 18 | with one token per line, with the line number corresponding to the index of 19 | that token in the vocabulary.} 20 | } 21 | \value{ 22 | A logical vector containing \code{TRUE} if the corresponding word was 23 | found verbatim in the vocabulary, \code{FALSE} otherwise. 24 | } 25 | \description{ 26 | Given some words and a word piece vocabulary, checks to see if the words are 27 | in the vocabulary. 28 | } 29 | \examples{ 30 | \dontrun{ 31 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 32 | to_check <- c("apple", "appl") 33 | check_vocab(words = to_check, ckpt_dir = BERT_PRETRAINED_DIR) # TRUE, FALSE 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /man/clean_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{clean_text} 4 | \alias{clean_text} 5 | \title{Perform invalid character removal and whitespace cleanup on text.} 6 | \usage{ 7 | clean_text(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar.} 11 | } 12 | \value{ 13 | Cleaned up text. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._clean_text from 17 | BERT: tokenization.py.) 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/convert_by_vocab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{convert_by_vocab} 4 | \alias{convert_by_vocab} 5 | \alias{convert_tokens_to_ids} 6 | \alias{convert_ids_to_tokens} 7 | \title{Convert a sequence of tokens/ids using the provided vocab.} 8 | \usage{ 9 | convert_by_vocab(vocab, items) 10 | 11 | convert_tokens_to_ids(vocab, tokens) 12 | 13 | convert_ids_to_tokens(inv_vocab, ids) 14 | } 15 | \arguments{ 16 | \item{vocab}{Vocabulary; provides mapping from index to tokens.
(This may 17 | be in fact an "inverse vocabulary", where the names are the indices and 18 | the values are the tokens.)} 19 | 20 | \item{items}{Vector of the keys (names in the vocab vector) to "convert".} 21 | 22 | \item{tokens}{Equivalent to items.} 23 | 24 | \item{inv_vocab}{Equivalent to vocab.} 25 | 26 | \item{ids}{Equivalent to items.} 27 | } 28 | \value{ 29 | Vector of the values in `vocab` corresponding to `items`. 30 | (The names on the returned vector are kept.) 31 | } 32 | \description{ 33 | Convert a sequence of tokens/ids using the provided vocab. 34 | } 35 | \section{Functions}{ 36 | \itemize{ 37 | \item \code{convert_tokens_to_ids}: Wrapper function for specifically converting 38 | tokens to ids. 39 | 40 | \item \code{convert_ids_to_tokens}: Wrapper function for specifically converting 41 | ids to tokens. 42 | }} 43 | 44 | \examples{ 45 | convert_by_vocab(c("token1" = 0, "token2" = 1), "token1") 46 | } 47 | -------------------------------------------------------------------------------- /man/convert_examples_to_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{convert_examples_to_features} 4 | \alias{convert_examples_to_features} 5 | \title{Convert \code{InputExample}s to \code{InputFeatures}} 6 | \usage{ 7 | convert_examples_to_features(examples, label_list, max_seq_length, tokenizer) 8 | } 9 | \arguments{ 10 | \item{examples}{List of \code{InputExample}s to convert.} 11 | 12 | \item{label_list}{Character (or integer?); possible labels for examples.} 13 | 14 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 15 | considered together.} 16 | 17 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 18 | FullTokenizer).} 19 | } 20 | \value{ 21 | A list of \code{InputFeatures}. 22 | } 23 | \description{ 24 | Converts a set of \code{InputExample}s to a list of \code{InputFeatures}. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | tokenizer <- FullTokenizer("vocab.txt") 29 | input_ex1 <- InputExample( 30 | guid = 1L, 31 | text_a = "Some text to classify.", 32 | text_b = "More wordy words.", 33 | label = "good" 34 | ) 35 | input_ex2 <- InputExample( 36 | guid = 2L, 37 | text_a = "This is another example.", 38 | text_b = "So many words.", 39 | label = "bad" 40 | ) 41 | feat <- convert_examples_to_features( 42 | examples = list(input_ex1, input_ex2), 43 | label_list = c("good", "bad"), 44 | max_seq_length = 15L, 45 | tokenizer = tokenizer 46 | ) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /man/convert_single_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{convert_single_example} 4 | \alias{convert_single_example} 5 | \title{Convert a single \code{InputExample} into a single \code{InputFeatures}} 6 | \usage{ 7 | convert_single_example( 8 | ex_index, 9 | example, 10 | label_list, 11 | max_seq_length, 12 | tokenizer 13 | ) 14 | } 15 | \arguments{ 16 | \item{ex_index}{Integer; the index of this example. This is used to determine 17 | whether or not to print out some log info (for debugging or runtime 18 | confirmation). 
It is assumed this starts with 1 (in R).} 19 | 20 | \item{example}{The \code{InputExample} to convert.} 21 | 22 | \item{label_list}{Character (or integer); allowed labels for these examples.} 23 | 24 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 25 | considered together.} 26 | 27 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 28 | FullTokenizer).} 29 | } 30 | \value{ 31 | An object of class \code{InputFeatures}. 32 | } 33 | \description{ 34 | Converts a single \code{InputExample} into a single \code{InputFeatures}. 35 | } 36 | \examples{ 37 | \dontrun{ 38 | tokenizer <- FullTokenizer("vocab.txt") 39 | input_ex <- InputExample( 40 | guid = 1L, 41 | text_a = "Some text to classify.", 42 | text_b = "More wordy words.", 43 | label = "good" 44 | ) 45 | feat <- convert_single_example( 46 | ex_index = 1L, 47 | example = input_ex, 48 | label_list = c("good", "bad"), 49 | max_seq_length = 15L, 50 | tokenizer = tokenizer 51 | ) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /man/convert_to_unicode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{convert_to_unicode} 4 | \alias{convert_to_unicode} 5 | \title{Convert `text` to Unicode} 6 | \usage{ 7 | convert_to_unicode(text) 8 | } 9 | \arguments{ 10 | \item{text}{character scalar to convert to unicode} 11 | } 12 | \value{ 13 | input text, converted to unicode if applicable 14 | } 15 | \description{ 16 | See documentation for `Encoding` for more information. 17 | Assumes utf-8 input. 18 | } 19 | \examples{ 20 | convert_to_unicode("fa\xC3\xA7ile") 21 | } 22 | -------------------------------------------------------------------------------- /man/create_attention_mask_from_input_mask.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{create_attention_mask_from_input_mask} 4 | \alias{create_attention_mask_from_input_mask} 5 | \title{Create 3D attention mask from a 2D tensor mask} 6 | \usage{ 7 | create_attention_mask_from_input_mask(from_tensor, to_mask) 8 | } 9 | \arguments{ 10 | \item{from_tensor}{2D or 3D Tensor of shape [batch_size, from_seq_length, 11 | ...].} 12 | 13 | \item{to_mask}{int32 Tensor of shape [batch_size, to_seq_length].} 14 | } 15 | \value{ 16 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 17 | } 18 | \description{ 19 | An attention mask is used to zero out specific elements of an attention 20 | matrix. (For example, to prevent the model from "paying attention to the 21 | answer" in certain training tasks.) 
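For intuition, here is a plain-R sketch of the broadcasting this function performs (not the TensorFlow implementation): the \code{[batch_size, to_seq_length]} mask is simply repeated across every "from" position.

to_mask <- matrix(c(1L, 1L, 1L,
                    1L, 1L, 0L),
                  nrow = 2, byrow = TRUE)        # [batch_size = 2, to_seq_length = 3]
from_seq_length <- 4L
attention_mask <- array(0L, dim = c(2L, from_seq_length, 3L))
for (b in 1:2) {
  # every "from" position in example b sees the same (possibly padded) "to" mask
  attention_mask[b, , ] <- matrix(rep(to_mask[b, ], each = from_seq_length),
                                  nrow = from_seq_length)
}
attention_mask[2, , ]                            # each row is 1 1 0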
22 | } 23 | \examples{ 24 | \dontrun{ 25 | with(tensorflow::tf$variable_scope("examples", 26 | reuse = tensorflow::tf$AUTO_REUSE 27 | ), { 28 | from_tensor <- ids <- tensorflow::tf$get_variable("ften", 29 | dtype = "float", shape = c(10, 20) 30 | ) 31 | to_mask <- ids <- tensorflow::tf$get_variable("mask", 32 | dtype = "int32", shape = c(10, 30) 33 | ) 34 | }) 35 | create_attention_mask_from_input_mask(from_tensor, to_mask) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/create_initializer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{create_initializer} 4 | \alias{create_initializer} 5 | \title{Create truncated normal initializer} 6 | \usage{ 7 | create_initializer(initializer_range = 0.02) 8 | } 9 | \arguments{ 10 | \item{initializer_range}{A double describing the range for the initializer 11 | (passed to the stddev parameter).} 12 | } 13 | \value{ 14 | A tensorflow initializer. 15 | } 16 | \description{ 17 | This is a wrapper around the tensorflow truncated_normal_initializer 18 | function. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | create_initializer(0.02) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/create_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{create_model} 4 | \alias{create_model} 5 | \title{Create a classification model} 6 | \usage{ 7 | create_model( 8 | bert_config, 9 | is_training, 10 | input_ids, 11 | input_mask, 12 | segment_ids, 13 | labels, 14 | num_labels 15 | ) 16 | } 17 | \arguments{ 18 | \item{bert_config}{\code{BertConfig} instance.} 19 | 20 | \item{is_training}{Logical; TRUE for training model, FALSE for eval model. 21 | Controls whether dropout will be applied.} 22 | 23 | \item{input_ids}{Integer Tensor of shape \code{[batch_size, seq_length]}.} 24 | 25 | \item{input_mask}{Integer Tensor of shape \code{[batch_size, seq_length]}.} 26 | 27 | \item{segment_ids}{Integer Tensor of shape \code{[batch_size, seq_length]}.} 28 | 29 | \item{labels}{Integer Tensor; represents training example classification 30 | labels. Length = batch size.} 31 | 32 | \item{num_labels}{Integer; number of classification labels.} 33 | } 34 | \value{ 35 | A list including the loss (for training) and the model output 36 | (softmax probabilities, log probs). 37 | } 38 | \description{ 39 | Takes the output layer from a BERT "spine" and appends a classifier layer to 40 | it. The output taken from BERT is the pooled first token layers (may want to 41 | modify the code to use token-level outputs). The classifier is essentially a 42 | single dense layer with softmax. 
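As a plain-R sketch of that classifier head (not the TensorFlow graph this function actually builds; the vector length, weights, and label count below are invented for illustration):

softmax <- function(x) exp(x) / sum(exp(x))
pooled_output <- rnorm(768)                         # stand-in for the pooled first-token vector
W <- matrix(rnorm(768 * 2, sd = 0.02), nrow = 768)  # hypothetical weights for 2 labels
b <- c(0, 0)
logits <- as.numeric(pooled_output %*% W) + b
probs <- softmax(logits)                            # the "softmax probabilities" in the return value
log_probs <- log(probs)                             # and the "log probs"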
43 | } 44 | \examples{ 45 | \dontrun{ 46 | with(tensorflow::tf$variable_scope("examples", 47 | reuse = tensorflow::tf$AUTO_REUSE 48 | ), { 49 | input_ids <- tensorflow::tf$constant(list( 50 | list(31L, 51L, 99L), 51 | list(15L, 5L, 0L) 52 | )) 53 | 54 | input_mask <- tensorflow::tf$constant(list( 55 | list(1L, 1L, 1L), 56 | list(1L, 1L, 0L) 57 | )) 58 | token_type_ids <- tensorflow::tf$constant(list( 59 | list(0L, 0L, 1L), 60 | list(0L, 2L, 0L) 61 | )) 62 | config <- BertConfig( 63 | vocab_size = 32000L, 64 | hidden_size = 768L, 65 | num_hidden_layers = 8L, 66 | num_attention_heads = 12L, 67 | intermediate_size = 1024L 68 | ) 69 | class_model <- create_model( 70 | bert_config = config, 71 | is_training = TRUE, 72 | input_ids = input_ids, 73 | input_mask = input_mask, 74 | segment_ids = token_type_ids, 75 | labels = c(1L, 2L), 76 | num_labels = 2L 77 | ) 78 | }) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /man/create_optimizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/optimization.R 3 | \name{create_optimizer} 4 | \alias{create_optimizer} 5 | \title{Create an optimizer training op} 6 | \usage{ 7 | create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu) 8 | } 9 | \arguments{ 10 | \item{loss}{Float Tensor; the loss for this step (calculated elsewhere; in 11 | principle is a function of trainable parameter values).} 12 | 13 | \item{init_lr}{Numeric; initial learning rate.} 14 | 15 | \item{num_train_steps}{Integer; number of steps to train for.} 16 | 17 | \item{num_warmup_steps}{Integer; number of steps to use for "warm-up".} 18 | 19 | \item{use_tpu}{Logical; whether to use TPU.} 20 | } 21 | \value{ 22 | A training op: the result of a tensorflow group() of operations. 23 | } 24 | \description{ 25 | \code{create_optimizer} doesn't actually return the optimizer object; it 26 | returns the operation resulting from a tf.group() call. 27 | } 28 | \details{ 29 | See also: 30 | 31 | \url{https://www.tensorflow.org/api_docs/python/tf/group} 32 | 33 | \url{https://stackoverflow.com/questions/41780655/what-is-the-difference-between-tf-group-and-tf-control-dependencies} 34 | 35 | The routine tf.gradients() is called in the course of this function.
36 | \url{https://www.tensorflow.org/api_docs/python/tf/gradients} 37 | } 38 | \examples{ 39 | \dontrun{ 40 | with(tensorflow::tf$variable_scope("examples", 41 | reuse = tensorflow::tf$AUTO_REUSE 42 | ), { 43 | totrain <- tensorflow::tf$get_variable( 44 | "totrain", 45 | tensorflow::shape(10L, 20L) 46 | ) 47 | loss <- 2 * totrain 48 | 49 | train_op <- create_optimizer( 50 | loss = loss, 51 | init_lr = 0.01, 52 | num_train_steps = 20L, 53 | num_warmup_steps = 10L, 54 | use_tpu = FALSE 55 | ) 56 | }) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/dot-InputFeatures_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.InputFeatures_EF} 4 | \alias{.InputFeatures_EF} 5 | \title{Construct objects of class \code{InputFeatures_EF}} 6 | \usage{ 7 | .InputFeatures_EF(unique_id, tokens, input_ids, input_mask, input_type_ids) 8 | } 9 | \arguments{ 10 | \item{unique_id}{Integer or character; a unique id for this example.} 11 | 12 | \item{tokens}{Character vector; the actual tokens in this example.} 13 | 14 | \item{input_ids}{Integer vector; the sequence of token ids in this example.} 15 | 16 | \item{input_mask}{Integer vector; sequence of 1s (for "real" tokens) and 0s 17 | (for padding tokens).} 18 | 19 | \item{input_type_ids}{Integer vector; aka token_type_ids. Indicators for 20 | which sentence (or sequence) each token belongs to. Classical BERT supports 21 | only 0s and 1s (for first and second sentence, respectively).} 22 | } 23 | \value{ 24 | An object of class \code{InputFeatures_EF}. 25 | } 26 | \description{ 27 | An InputFeatures object is a single set of (input) features of data used for 28 | (output) feature extraction. Note that this class is similar to the 29 | InputFeatures class used for simple sequence classification, with annoying 30 | differences. Will eventually standardize; till then, check parameter names. 31 | (RBERT issue #28.) 32 | } 33 | \keyword{internal} 34 | -------------------------------------------------------------------------------- /man/dot-choose_BERT_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.choose_BERT_dir} 4 | \alias{.choose_BERT_dir} 5 | \title{Choose a directory for BERT checkpoints} 6 | \usage{ 7 | .choose_BERT_dir(dir) 8 | } 9 | \arguments{ 10 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 11 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 12 | determined from the \code{dir} parameter if supplied, followed by the 13 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 14 | folder in the user cache directory (determined using 15 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 16 | `RBERT.dir` option will be updated to that location. Note that the 17 | checkpoint will create a subdirectory inside this \code{dir}.} 18 | } 19 | \value{ 20 | A character vector indicating a directory in which BERT checkpoints 21 | are stored. 22 | } 23 | \description{ 24 | If \code{dir} is not NULL, this function simply returns \code{dir}. Otherwise 25 | it checks the `RBERT.dir` option, and then uses 26 | \code{\link[rappdirs]{user_cache_dir}} to choose a directory if necessary.
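That precedence can be pictured with a standalone sketch (this is not the function's source; the %||% helper is defined here only for illustration):

`%||%` <- function(x, y) if (is.null(x)) y else x
chosen_dir <- dir %||%
  getOption("RBERT.dir") %||%
  rappdirs::user_cache_dir(appname = "RBERT")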
27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/dot-convert_examples_to_features_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.convert_examples_to_features_EF} 4 | \alias{.convert_examples_to_features_EF} 5 | \title{Convert \code{InputExample_EF}s to \code{InputFeatures_EF}} 6 | \usage{ 7 | .convert_examples_to_features_EF(examples, seq_length, tokenizer) 8 | } 9 | \arguments{ 10 | \item{examples}{List of \code{InputExample_EF}s to convert.} 11 | 12 | \item{seq_length}{Integer; the maximum number of tokens that will be 13 | considered together.} 14 | 15 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 16 | FullTokenizer).} 17 | } 18 | \value{ 19 | A list of \code{InputFeatures}. 20 | } 21 | \description{ 22 | Converts a set of \code{InputExample_EF}s to a list of 23 | \code{InputFeatures_EF}. Very similar to \code{convert_examples_to_features} 24 | from run_classifier.R. (RBERT issue #28.) 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/dot-convert_single_example_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.convert_single_example_EF} 4 | \alias{.convert_single_example_EF} 5 | \title{Convert a single \code{InputExample_EF} into a single \code{InputFeatures_EF}} 6 | \usage{ 7 | .convert_single_example_EF(ex_index, example, seq_length, tokenizer) 8 | } 9 | \arguments{ 10 | \item{ex_index}{Integer; the index of this example. This is used to determine 11 | whether or not to print out some log info (for debugging or runtime 12 | confirmation). It is assumed this starts with 1 (in R).} 13 | 14 | \item{example}{The \code{InputExample_EF} to convert.} 15 | 16 | \item{seq_length}{Integer; the maximum number of tokens that will be 17 | considered together.} 18 | 19 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 20 | FullTokenizer).} 21 | } 22 | \value{ 23 | An object of class \code{InputFeatures_EF}. 24 | } 25 | \description{ 26 | Converts a single \code{InputExample_EF} into a single 27 | \code{InputFeatures_EF}. Very similar to \code{convert_single_example} from 28 | run_classifier.R. (RBERT issue #28.) 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/dot-download_BERT_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.download_BERT_checkpoint} 4 | \alias{.download_BERT_checkpoint} 5 | \title{Download a checkpoint zip file} 6 | \usage{ 7 | .download_BERT_checkpoint(url, checkpoint_zip_path) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | 13 | \item{checkpoint_zip_path}{The path to which the checkpoint zip should be 14 | downloaded.} 15 | } 16 | \value{ 17 | \code{TRUE} invisibly. 
18 | } 19 | \description{ 20 | Download a checkpoint zip file 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/dot-get_actual_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.get_actual_index} 4 | \alias{.get_actual_index} 5 | \title{Standardize Indices} 6 | \usage{ 7 | .get_actual_index(index, length) 8 | } 9 | \arguments{ 10 | \item{index}{Integer; the index to normalize.} 11 | 12 | \item{length}{Integer; the length of the vector or list we are indexing.} 13 | } 14 | \value{ 15 | The "actual" integer index, between 1 and \code{length}, inclusive. 16 | } 17 | \description{ 18 | Convert negative indices to positive ones. Use the convention that 19 | \code{vec[[-1L]]} signifies the last element of \code{vec}, \code{vec[[-2L]]} 20 | signifies the second-to-last element of \code{vec}, and so on. 1-based 21 | indexing is assumed. Values of zero, or out-of-range indices, will be 22 | rejected. 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/dot-get_model_archive_path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_archive_path} 4 | \alias{.get_model_archive_path} 5 | \title{Locate an archive file for a BERT checkpoint} 6 | \usage{ 7 | .get_model_archive_path(model, dir, archive_type) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | 12 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 13 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 14 | determined from the \code{dir} parameter if supplied, followed by the 15 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 16 | folder in the user cache directory (determined using 17 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 18 | `RBERT.dir` option will be updated to that location. Note that the 19 | checkpoint will create a subdirectory inside this \code{dir}.} 20 | 21 | \item{archive_type}{How is the checkpoint archived? We currently support 22 | "zip" and "tar-gzip". Leave NULL to infer from the \code{url}.} 23 | } 24 | \value{ 25 | The path to the archive file where the raw checkpoint should be 26 | saved. 27 | } 28 | \description{ 29 | Locate an archive file for a BERT checkpoint 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /man/dot-get_model_archive_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_archive_type} 4 | \alias{.get_model_archive_type} 5 | \title{Get archive type of a BERT checkpoint} 6 | \usage{ 7 | .get_model_archive_type(model) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | } 12 | \value{ 13 | The archive type to the specified BERT model. 14 | } 15 | \description{ 16 | Returns the archive type ("zip" or "tar-gzip") of the specified BERT 17 | checkpoint from the Google Research collection or other repository. 
18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/dot-get_model_subdir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_subdir} 4 | \alias{.get_model_subdir} 5 | \title{Locate a subdir for a BERT checkpoint} 6 | \usage{ 7 | .get_model_subdir(model, dir) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | 12 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 13 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 14 | determined from the \code{dir} parameter if supplied, followed by the 15 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 16 | folder in the user cache directory (determined using 17 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 18 | `RBERT.dir` option will be updated to that location. Note that the 19 | checkpoint will create a subdirectory inside this \code{dir}.} 20 | } 21 | \value{ 22 | The path to the sub-directory where the checkpoint should be saved. 23 | } 24 | \description{ 25 | Locate a subdir for a BERT checkpoint 26 | } 27 | \keyword{internal} 28 | -------------------------------------------------------------------------------- /man/dot-get_model_url.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_url} 4 | \alias{.get_model_url} 5 | \title{Get url of a BERT checkpoint} 6 | \usage{ 7 | .get_model_url(model) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | } 12 | \value{ 13 | The url to the specified BERT model. 14 | } 15 | \description{ 16 | Returns the url of the specified BERT checkpoint from the Google Research 17 | collection or other repository. 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/dot-has_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.has_checkpoint} 4 | \alias{.has_checkpoint} 5 | \title{Check whether the user already has a checkpoint} 6 | \usage{ 7 | .has_checkpoint(model = NULL, dir = NULL, ckpt_dir = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | 12 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 13 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 14 | determined from the \code{dir} parameter if supplied, followed by the 15 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 16 | folder in the user cache directory (determined using 17 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 18 | `RBERT.dir` option will be updated to that location. Note that the 19 | checkpoint will create a subdirectory inside this \code{dir}.} 20 | 21 | \item{ckpt_dir}{The path to the subdir where this checkpoint should 22 | be saved. 
If model is given, ckpt_dir is inferred.} 23 | } 24 | \value{ 25 | A logical indicating whether the user already has that checkpoint in 26 | that location. 27 | } 28 | \description{ 29 | Check the specified dir (or the default dir if none is specified) for a given 30 | model or url. 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/dot-infer_archive_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.infer_archive_type} 4 | \alias{.infer_archive_type} 5 | \title{Infer the archive type for a BERT checkpoint} 6 | \usage{ 7 | .infer_archive_type(url) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | } 13 | \value{ 14 | A character vector, currently either "zip" or "tar-gzip". 15 | } 16 | \description{ 17 | Infer the archive type for a BERT checkpoint 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/dot-infer_checkpoint_archive_path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.infer_checkpoint_archive_path} 4 | \alias{.infer_checkpoint_archive_path} 5 | \title{Infer the path to the archive for a BERT checkpoint} 6 | \usage{ 7 | .infer_checkpoint_archive_path(url, dir) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | 13 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 14 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 15 | determined from the \code{dir} parameter if supplied, followed by the 16 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 17 | folder in the user cache directory (determined using 18 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 19 | `RBERT.dir` option will be updated to that location. Note that the 20 | checkpoint will create a subdirectory inside this \code{dir}.} 21 | } 22 | \value{ 23 | A character vector file path, pointing to where the raw checkpoint 24 | archive should be saved. 25 | } 26 | \description{ 27 | Infer the path to the archive for a BERT checkpoint 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/dot-infer_ckpt_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.infer_ckpt_dir} 4 | \alias{.infer_ckpt_dir} 5 | \title{Infer the subdir for a BERT checkpoint} 6 | \usage{ 7 | .infer_ckpt_dir(url, dir) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | 13 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 14 | \code{NULL} to allow RBERT to automatically choose a directory. 
The path is 15 | determined from the \code{dir} parameter if supplied, followed by the 16 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 17 | folder in the user cache directory (determined using 18 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 19 | `RBERT.dir` option will be updated to that location. Note that the 20 | checkpoint will create a subdirectory inside this \code{dir}.} 21 | } 22 | \value{ 23 | A character vector file path, reflecting the "name" part of a 24 | checkpoint \code{url}, placed within \code{dir}. 25 | } 26 | \description{ 27 | Infer the subdir for a BERT checkpoint 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/dot-infer_model_paths.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{.infer_model_paths} 4 | \alias{.infer_model_paths} 5 | \title{Find Paths to Checkpoint Files} 6 | \usage{ 7 | .infer_model_paths( 8 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 9 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 10 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 11 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 12 | ckpt_dir = NULL, 13 | vocab_file = find_vocab(ckpt_dir), 14 | bert_config_file = find_config(ckpt_dir), 15 | init_checkpoint = find_ckpt(ckpt_dir) 16 | ) 17 | } 18 | \arguments{ 19 | \item{model}{Character; which model checkpoint to use. If specified, 20 | \code{ckpt_dir}, \code{vocab_file}, \code{bert_config_file}, and 21 | \code{init_checkpoint} will be inferred. If you do not have this 22 | checkpoint, you will be prompted to download it in interactive mode.} 23 | 24 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 25 | other checkpoint files required by this function (\code{vocab_file}, 26 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 27 | standard filenames within \code{ckpt_dir}.} 28 | 29 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 30 | with one token per line, with the line number corresponding to the index of 31 | that token in the vocabulary.} 32 | 33 | \item{bert_config_file}{Character; the path to a json config file.} 34 | 35 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 36 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 37 | explicit, starting with "/".} 38 | } 39 | \value{ 40 | A list with components vocab_file, bert_config_file, and 41 | init_checkpoint. 42 | } 43 | \description{ 44 | In some functions, the user can specify a model, a ckpt_dir, and/or specific 45 | paths to checkpoint files. This function sorts all of that out.
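For example, the public \code{extract_features} (documented later in this directory) accepts all three levels of specificity, and this helper reconciles them. A usage sketch (not run; the paths are illustrative only):

feats <- extract_features(examples, model = "bert_base_uncased")
feats <- extract_features(examples, ckpt_dir = "/path/to/uncased_L-12_H-768_A-12")
feats <- extract_features(
  examples,
  vocab_file = "/path/to/vocab.txt",
  bert_config_file = "/path/to/bert_config.json",
  init_checkpoint = "/path/to/bert_model.ckpt"
)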
46 | } 47 | \keyword{internal} 48 | -------------------------------------------------------------------------------- /man/dot-maybe_download_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.maybe_download_checkpoint} 4 | \alias{.maybe_download_checkpoint} 5 | \title{Find or Possibly Download a Checkpoint} 6 | \usage{ 7 | .maybe_download_checkpoint( 8 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 9 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 10 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 11 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 12 | dir = NULL, 13 | ckpt_dir = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{model}{Character vector. Which model checkpoint to download.} 18 | 19 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 20 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 21 | determined from the \code{dir} parameter if supplied, followed by the 22 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 23 | folder in the user cache directory (determined using 24 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 25 | `RBERT.dir` option will be updated to that location. Note that the 26 | checkpoint will create a subdirectory inside this \code{dir}.} 27 | 28 | \item{ckpt_dir}{The path to the subdir where this checkpoint should 29 | be saved. If model is given, ckpt_dir is inferred.} 30 | } 31 | \value{ 32 | TRUE (invisibly) 33 | } 34 | \description{ 35 | Verify that the user has a specified checkpoint, and prompt to download if 36 | they don't (in interactive mode). 37 | } 38 | \keyword{internal} 39 | -------------------------------------------------------------------------------- /man/dot-model_fn_builder_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.model_fn_builder_EF} 4 | \alias{.model_fn_builder_EF} 5 | \title{Define \code{model_fn} closure for \code{TPUEstimator}} 6 | \usage{ 7 | .model_fn_builder_EF(bert_config, init_checkpoint, layer_indexes, use_tpu) 8 | } 9 | \arguments{ 10 | \item{bert_config}{\code{BertConfig} instance.} 11 | 12 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 13 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 14 | explicit, starting with "/".} 15 | 16 | \item{layer_indexes}{Integer list; indexes (positive, or negative counting 17 | back from the end) indicating which layers to extract as "output features". 18 | (It needs to be specified here because we get them back as the model 19 | "predictions".)} 20 | 21 | \item{use_tpu}{Logical; whether to use TPU.} 22 | } 23 | \value{ 24 | \code{model_fn} closure for \code{TPUEstimator}. 25 | } 26 | \description{ 27 | Returns \code{model_fn} closure, which is an input to \code{TPUEstimator}. 28 | This function is similar to \code{model_fn_builder} from run_classifier.R. 29 | (RBERT issue #28.) 
30 | } 31 | \details{ 32 | The \code{model_fn} function takes four parameters: \describe{ 33 | \item{features}{A list (or similar structure) that contains objects such as 34 | \code{input_ids}, \code{input_mask}, \code{tokens}, and 35 | \code{input_type_ids}. These objects will be inputs to the 36 | \code{create_model} function.} \item{labels}{Not used in this function, but 37 | presumably we need to keep this slot here.} \item{mode}{Character; value such 38 | as "train", "infer", or "eval".} \item{params}{Not used in this function, but 39 | presumably we need to keep this slot here.} } 40 | 41 | The output of \code{model_fn} is the result of a 42 | \code{tf$contrib$tpu$TPUEstimatorSpec} call. 43 | } 44 | \keyword{internal} 45 | -------------------------------------------------------------------------------- /man/dot-process_BERT_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.process_BERT_checkpoint} 4 | \alias{.process_BERT_checkpoint} 5 | \title{Unzip and check a BERT checkpoint zip} 6 | \usage{ 7 | .process_BERT_checkpoint(dir, checkpoint_archive_path, ckpt_dir, archive_type) 8 | } 9 | \arguments{ 10 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 11 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 12 | determined from the \code{dir} parameter if supplied, followed by the 13 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 14 | folder in the user cache directory (determined using 15 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 16 | `RBERT.dir` option will be updated to that location. Note that the 17 | checkpoint will create a subdirectory inside this \code{dir}.} 18 | 19 | \item{archive_type}{How is the checkpoint archived? We currently support 20 | "zip" and "tar-gzip". Leave NULL to infer from the \code{url}.} 21 | } 22 | \value{ 23 | \code{TRUE} invisibly. 24 | } 25 | \description{ 26 | Unzip and check a BERT checkpoint zip 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/download_BERT_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{download_BERT_checkpoint} 4 | \alias{download_BERT_checkpoint} 5 | \title{Download a BERT checkpoint} 6 | \source{ 7 | \url{https://github.com/google-research/bert} 8 | 9 | \url{https://github.com/allenai/scibert} 10 | } 11 | \usage{ 12 | download_BERT_checkpoint( 13 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 14 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 15 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 16 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 17 | dir = NULL, 18 | url = NULL, 19 | force = FALSE, 20 | keep_archive = FALSE, 21 | archive_type = NULL 22 | ) 23 | } 24 | \arguments{ 25 | \item{model}{Character vector. Which model checkpoint to download.} 26 | 27 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 28 | \code{NULL} to allow RBERT to automatically choose a directory. 
The path is 29 | determined from the \code{dir} parameter if supplied, followed by the 30 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 31 | folder in the user cache directory (determined using 32 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 33 | `RBERT.dir` option will be updated to that location. Note that the 34 | checkpoint will create a subdirectory inside this \code{dir}.} 35 | 36 | \item{url}{Character vector. An optional url from which to download a 37 | checkpoint. Overrides \code{model} parameter if not NULL.} 38 | 39 | \item{force}{Logical. Download even if the checkpoint already exists in the 40 | specified directory? Default \code{FALSE}.} 41 | 42 | \item{keep_archive}{Logical. Keep the zip (or other archive) file? Leave as 43 | \code{FALSE} to save space.} 44 | 45 | \item{archive_type}{How is the checkpoint archived? We currently support 46 | "zip" and "tar-gzip". Leave NULL to infer from the \code{url}.} 47 | } 48 | \value{ 49 | If successful, returns the path to the downloaded checkpoint. 50 | } 51 | \description{ 52 | Downloads the specified BERT checkpoint from the Google Research collection 53 | or other repositories. 54 | } 55 | \section{Checkpoints}{ 56 | \code{download_BERT_checkpoint} knows about several 57 | pre-trained BERT checkpoints. You can specify these checkpoints using the 58 | \code{model} parameter. Alternatively, you can supply a direct \code{url} 59 | to any BERT tensorflow checkpoint. 60 | 61 | \tabular{rccccl}{ model \tab layers \tab hidden \tab heads \tab parameters 62 | \tab special\cr bert_base_* \tab 12 \tab 768 \tab 12 \tab 110M\cr 63 | bert_large_* \tab 24 \tab 1024 \tab 16 \tab 340M\cr bert_large_*_wwm \tab 64 | 24 \tab 1024 \tab 16 \tab 340M \tab whole word masking\cr 65 | bert_base_multilingual_cased \tab 12 \tab 768 \tab 12 \tab 110M \tab 104 66 | languages\cr bert_base_chinese \tab 12 \tab 768 \tab 12 \tab 110M \tab 67 | Chinese Simplified and Traditional\cr scibert_scivocab_* \tab 12 \tab 768 68 | \tab 12 \tab 110M \tab Trained using the full text of 1.14M scientific 69 | papers (18\% computer science, 82\% biomedical), with a science-specific 70 | vocabulary.\cr scibert_basevocab_uncased \tab 12 \tab 768 \tab 12 \tab 110M 71 | \tab As scibert_scivocab_*, but using the original BERT vocabulary. } 72 | } 73 | 74 | \examples{ 75 | \dontrun{ 76 | download_BERT_checkpoint("bert_base_uncased") 77 | download_BERT_checkpoint("bert_large_uncased") 78 | temp_dir <- tempdir() 79 | download_BERT_checkpoint("bert_base_uncased", dir = temp_dir) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /man/dropout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{dropout} 4 | \alias{dropout} 5 | \title{Perform Dropout} 6 | \usage{ 7 | dropout(input_tensor, dropout_prob = NULL) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Float Tensor to perform dropout on.} 11 | 12 | \item{dropout_prob}{A double giving the probability of dropping out a value 13 | (NOT of KEEPING a dimension as in `tf.nn.dropout`).} 14 | } 15 | \value{ 16 | A version of `input_tensor` with dropout applied. 
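As a reminder of the convention noted for \code{dropout_prob} above (a sketch, not the package source): the argument is the probability of dropping, i.e. the complement of the keep probability used by \code{tf$nn$dropout} in TensorFlow 1.x.

dropout_prob <- 0.1
keep_prob <- 1 - dropout_prob   # what TF 1.x tf$nn$dropout would expect instead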
17 | } 18 | \description{ 19 | Perform Dropout 20 | } 21 | \examples{ 22 | \dontrun{ 23 | tfx <- tensorflow::tf$get_variable("none", tensorflow::shape(10L)) 24 | dropout(tfx, 0.5) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/embedding_lookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{embedding_lookup} 4 | \alias{embedding_lookup} 5 | \title{Look up word embeddings for id tensor} 6 | \usage{ 7 | embedding_lookup( 8 | input_ids, 9 | vocab_size, 10 | embedding_size = 128L, 11 | initializer_range = 0.02, 12 | word_embedding_name = "word_embeddings" 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_ids}{Integer Tensor of shape [batch_size, seq_length] containing 17 | word ids.} 18 | 19 | \item{vocab_size}{Size of the embedding vocabulary (integer).} 20 | 21 | \item{embedding_size}{Width of the word embeddings (integer).} 22 | 23 | \item{initializer_range}{Embedding initialization range (float).} 24 | 25 | \item{word_embedding_name}{Name of the embedding table (character).} 26 | } 27 | \value{ 28 | Float Tensor of shape [batch_size, seq_length, embedding_size], along 29 | with the embedding table in a list. 30 | } 31 | \description{ 32 | Look up word embeddings for id tensor 33 | } 34 | \examples{ 35 | \dontrun{ 36 | with( 37 | tensorflow::tf$variable_scope("examples", 38 | reuse = tensorflow::tf$AUTO_REUSE 39 | ), 40 | ids <- tensorflow::tf$get_variable("x", 41 | dtype = "int32", 42 | shape = tensorflow::shape(10, 20) 43 | ) 44 | ) 45 | embedding_lookup(ids, vocab_size = 100, word_embedding_name = "some_name") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /man/embedding_postprocessor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{embedding_postprocessor} 4 | \alias{embedding_postprocessor} 5 | \title{Perform various post-processing on a word embedding tensor} 6 | \usage{ 7 | embedding_postprocessor( 8 | input_tensor, 9 | use_token_type = FALSE, 10 | token_type_ids = NULL, 11 | token_type_vocab_size = 16L, 12 | token_type_embedding_name = "token_type_embeddings", 13 | use_position_embeddings = TRUE, 14 | position_embedding_name = "position_embeddings", 15 | initializer_range = 0.02, 16 | max_position_embeddings = 512L, 17 | dropout_prob = 0.1 18 | ) 19 | } 20 | \arguments{ 21 | \item{input_tensor}{Float Tensor of shape \code{[batch_size, seq_length, 22 | embedding_size]}.} 23 | 24 | \item{use_token_type}{Logical; whether to add embeddings for 25 | \code{token_type_ids}.} 26 | 27 | \item{token_type_ids}{(optional) Integer Tensor of shape \code{[batch_size, 28 | seq_length]}. Must be specified if \code{use_token_type} is TRUE.} 29 | 30 | \item{token_type_vocab_size}{Integer; the vocabulary size of 31 | \code{token_type_ids}.
This defaults to 16 (here and in BERT code), but 32 | must be set to 2 for compatibility with saved checkpoints.} 33 | 34 | \item{token_type_embedding_name}{Character; the name of the embedding table 35 | variable for token type ids.} 36 | 37 | \item{use_position_embeddings}{Logical; whether to add position embeddings 38 | for the position of each token in the sequence.} 39 | 40 | \item{position_embedding_name}{Character; the name of the embedding table 41 | variable for positional embeddings.} 42 | 43 | \item{initializer_range}{Numeric; range of the weight initialization.} 44 | 45 | \item{max_position_embeddings}{Integer; maximum sequence length that might 46 | ever be used with this model. This can be longer than the sequence length 47 | of input_tensor, but cannot be shorter.} 48 | 49 | \item{dropout_prob}{Numeric; dropout probability applied to the final output 50 | tensor.} 51 | } 52 | \value{ 53 | Float Tensor with same shape as \code{input_tensor}. 54 | } 55 | \description{ 56 | This function (optionally) adds to the word embeddings additional embeddings 57 | for token type and position. 58 | } 59 | \details{ 60 | See figure 2 in the BERT paper: 61 | 62 | \url{https://arxiv.org/pdf/1810.04805.pdf} 63 | 64 | Both type and position embeddings are learned model variables. Note that 65 | token "type" is essentially a sentence identifier, indicating which sentence 66 | (or, more generally, piece of text) the token belongs to. 67 | } 68 | \examples{ 69 | \dontrun{ 70 | batch_size <- 10 71 | seq_length <- 512 72 | embedding_size <- 200 73 | with(tensorflow::tf$variable_scope("examples", 74 | reuse = tensorflow::tf$AUTO_REUSE 75 | ), { 76 | input_tensor <- tensorflow::tf$get_variable( 77 | "input", 78 | dtype = "float", 79 | shape = tensorflow::shape(batch_size, seq_length, embedding_size) 80 | ) 81 | token_type_ids <- tensorflow::tf$get_variable( 82 | "ids", 83 | dtype = "int32", 84 | shape = tensorflow::shape(batch_size, seq_length) 85 | ) 86 | }) 87 | embedding_postprocessor(input_tensor, 88 | use_token_type = TRUE, 89 | token_type_ids = token_type_ids 90 | ) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /man/extract_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{extract_features} 4 | \alias{extract_features} 5 | \title{Extract output features from BERT} 6 | \usage{ 7 | extract_features( 8 | examples, 9 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 10 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 11 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 12 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 13 | ckpt_dir = NULL, 14 | vocab_file = find_vocab(ckpt_dir), 15 | bert_config_file = find_config(ckpt_dir), 16 | init_checkpoint = find_ckpt(ckpt_dir), 17 | output_file = NULL, 18 | max_seq_length = 128L, 19 | layer_indexes = -4:-1, 20 | batch_size = 2L, 21 | features = c("output", "attention"), 22 | verbose = FALSE 23 | ) 24 | } 25 | \arguments{ 26 | \item{examples}{List of \code{InputExample_EF} objects, or character 27 | vector(s) that can be converted to \code{InputExample_EF} objects.} 28 | 29 | \item{model}{Character; which model checkpoint to use. 
If specified, 30 | \code{ckpt_dir}, \code{vocab_file}, \code{bert_config_file}, and 31 | \code{init_checkpoint} will be inferred. If you do not have this 32 | checkpoint, you will be prompted to download it in interactive mode.} 33 | 34 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 35 | other checkpoint files required by this function (\code{vocab_file}, 36 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 37 | standard filenames within \code{ckpt_dir}.} 38 | 39 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 40 | with one token per line, with the line number corresponding to the index of 41 | that token in the vocabulary.} 42 | 43 | \item{bert_config_file}{Character; the path to a json config file.} 44 | 45 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 46 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 47 | explicit, starting with "/".} 48 | 49 | \item{output_file}{(optional) Character; file path (stub) for writing output 50 | to.} 51 | 52 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 53 | considered together.} 54 | 55 | \item{layer_indexes}{Integer vector; indexes (positive, or negative counting 56 | back from the end) indicating which layers to extract as "output features". 57 | The "zeroth" layer embeddings are the input embedding vectors to the first 58 | layer.} 59 | 60 | \item{batch_size}{Integer; how many examples to process per batch.} 61 | 62 | \item{features}{Character; whether to return "output" (layer outputs, the 63 | default), "attention" (attention probabilities), or both.} 64 | 65 | \item{verbose}{Logical; if FALSE, suppresses most of the TensorFlow chatter 66 | by temporarily setting the logging threshold to its highest level. If TRUE, 67 | keeps the current logging threshold, which defaults to "WARN". To change 68 | the logging threshold of the current session, run 69 | \code{tensorflow::tf$logging$set_verbosity(tensorflow::tf$logging$DEBUG)} 70 | (setting whatever verbosity level you want).} 71 | } 72 | \value{ 73 | A list with elements "output" (the layer outputs as a tibble) and/or 74 | "attention" (the attention weights as a tibble). 75 | } 76 | \description{ 77 | Given example sentences (as a list of \code{InputExample_EF}s), apply an 78 | existing BERT model and capture certain output layers. (These could 79 | potentially be used as features in downstream tasks.) 80 | } 81 | \examples{ 82 | \dontrun{ 83 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 84 | examples <- c("I saw the branch on the bank.", 85 | "I saw the branch of the bank.") 86 | 87 | # Just specify checkpoint directory. 88 | feats <- extract_features( 89 | examples = examples, 90 | ckpt_dir = BERT_PRETRAINED_DIR 91 | ) 92 | # Can also just specify the model, if you have it downloaded. 93 | # In interactive mode, you'll be prompted to download the model if you do not 94 | # have it.
95 | feats <- extract_features( 96 | examples = examples, 97 | model = "bert_base_uncased" 98 | ) 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /man/figures/rbert_hex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/man/figures/rbert_hex.png -------------------------------------------------------------------------------- /man/file_based_convert_examples_to_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{file_based_convert_examples_to_features} 4 | \alias{file_based_convert_examples_to_features} 5 | \title{Convert a set of \code{InputExample}s to a TFRecord file.} 6 | \usage{ 7 | file_based_convert_examples_to_features( 8 | examples, 9 | label_list, 10 | max_seq_length, 11 | tokenizer, 12 | output_file 13 | ) 14 | } 15 | \arguments{ 16 | \item{examples}{List of \code{InputExample}s to convert.} 17 | 18 | \item{label_list}{Character (or integer?); possible labels for examples.} 19 | 20 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 21 | considered together.} 22 | 23 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 24 | FullTokenizer).} 25 | 26 | \item{output_file}{Character; path to file to write to.} 27 | } 28 | \value{ 29 | return value 30 | } 31 | \description{ 32 | description 33 | } 34 | -------------------------------------------------------------------------------- /man/file_based_input_fn_builder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{file_based_input_fn_builder} 4 | \alias{file_based_input_fn_builder} 5 | \title{summary} 6 | \usage{ 7 | file_based_input_fn_builder(x) 8 | } 9 | \arguments{ 10 | \item{x}{This parameter will be described when this function is implemented.} 11 | } 12 | \value{ 13 | return value 14 | } 15 | \description{ 16 | description 17 | } 18 | -------------------------------------------------------------------------------- /man/find_files.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{find_files} 4 | \alias{find_files} 5 | \alias{find_vocab} 6 | \alias{find_config} 7 | \alias{find_ckpt} 8 | \title{Find Checkpoint Files} 9 | \usage{ 10 | find_vocab(ckpt_dir) 11 | 12 | find_config(ckpt_dir) 13 | 14 | find_ckpt(ckpt_dir) 15 | } 16 | \arguments{ 17 | \item{ckpt_dir}{Character; the path to the checkpoint directory. If this 18 | argument is NULL, the associated functions also return NULL.} 19 | } 20 | \description{ 21 | Given the path to a checkpoint directory, return the paths to certain files 22 | in that directory. 23 | } 24 | \section{Functions}{ 25 | \itemize{ 26 | \item \code{find_vocab}: Find the vocabulary file ('vocab.txt'). 27 | 28 | \item \code{find_config}: Find the config file ('bert_config.json'). 29 | 30 | \item \code{find_ckpt}: Find the checkpoint file stub (files begin with 31 | 'bert_model.ckpt'). 
32 | }} 33 | 34 | -------------------------------------------------------------------------------- /man/gelu.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{gelu} 4 | \alias{gelu} 5 | \title{Gaussian Error Linear Unit} 6 | \usage{ 7 | gelu(x) 8 | } 9 | \arguments{ 10 | \item{x}{Float Tensor to perform activation on.} 11 | } 12 | \value{ 13 | `x` with the GELU activation applied. 14 | } 15 | \description{ 16 | This is a smoother version of the RELU. Original paper: 17 | https://arxiv.org/abs/1606.08415 18 | } 19 | \examples{ 20 | \dontrun{ 21 | tfx <- tensorflow::tf$get_variable("none", tensorflow::shape(10L)) 22 | gelu(tfx) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/get_activation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{get_activation} 4 | \alias{get_activation} 5 | \title{Map a string to a Python function} 6 | \usage{ 7 | get_activation(activation_string) 8 | } 9 | \arguments{ 10 | \item{activation_string}{String name of the activation function.} 11 | } 12 | \value{ 13 | A function corresponding to the activation function. If 14 | \code{activation_string} is NA, empty, or "linear", this will return NA. If 15 | \code{activation_string} is not a string, it will return 16 | \code{activation_string}. 17 | } 18 | \description{ 19 | Example: "relu" => `tensorflow::tf$nn$relu`. 20 | } 21 | \examples{ 22 | \dontrun{ 23 | get_activation("gelu") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/get_assignment_map_from_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{get_assignment_map_from_checkpoint} 4 | \alias{get_assignment_map_from_checkpoint} 5 | \title{Compute the intersection of the current variables and checkpoint variables} 6 | \usage{ 7 | get_assignment_map_from_checkpoint(tvars, init_checkpoint) 8 | } 9 | \arguments{ 10 | \item{tvars}{List of training variables in the current model.} 11 | 12 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 13 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 14 | explicit, starting with "/".} 15 | } 16 | \value{ 17 | List with two elements: the assignment map and the initialized 18 | variable names. The assignment map is a list of the "base" variable names 19 | that are in both the current computational graph and the checkpoint. The 20 | initialized variable names list contains both the base names and the base 21 | names + ":0". (This seems redundant to me. I assume it will make sense 22 | later. -JDB) 23 | } 24 | \description{ 25 | Returns the intersection (not the union, as python docs say -JDB) of the sets 26 | of variable names from the current graph and the checkpoint. 27 | } 28 | \details{ 29 | Note that a Tensorflow checkpoint is not the same as a saved model. 
A saved 30 | model contains a complete description of the computational graph and is 31 | sufficient to reconstruct the entire model, while a checkpoint contains just 32 | the parameter values (and variable names), and so requires a specification of 33 | the original model structure to reconstruct the computational graph. -JDB 34 | } 35 | \examples{ 36 | \dontrun{ 37 | # Just for illustration: create a "model" with a couple variables 38 | # that overlap some variable names in the BERT checkpoint. 39 | with(tensorflow::tf$variable_scope("bert", 40 | reuse = tensorflow::tf$AUTO_REUSE 41 | ), { 42 | test_ten1 <- tensorflow::tf$get_variable( 43 | "encoder/layer_9/output/dense/bias", 44 | shape = c(1L, 2L, 3L) 45 | ) 46 | test_ten2 <- tensorflow::tf$get_variable( 47 | "encoder/layer_9/output/dense/kernel", 48 | shape = c(1L, 2L, 3L) 49 | ) 50 | }) 51 | tvars <- tensorflow::tf$get_collection( 52 | tensorflow::tf$GraphKeys$GLOBAL_VARIABLES 53 | ) 54 | temp_dir <- tempdir() 55 | init_checkpoint <- file.path( 56 | temp_dir, 57 | "BERT_checkpoints", 58 | "uncased_L-12_H-768_A-12", 59 | "bert_model.ckpt" 60 | ) 61 | 62 | amap <- get_assignment_map_from_checkpoint(tvars, init_checkpoint) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /man/get_shape_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{get_shape_list} 4 | \alias{get_shape_list} 5 | \title{Return the shape of tensor} 6 | \usage{ 7 | get_shape_list(tensor, expected_rank = NULL, name = NULL) 8 | } 9 | \arguments{ 10 | \item{tensor}{A tf.Tensor object to find the shape of.} 11 | 12 | \item{expected_rank}{The expected rank of \code{tensor}, as an integer vector 13 | or list. If this is specified and the \code{tensor} has a rank not listed 14 | in \code{expected_rank}, an exception will be thrown.} 15 | 16 | \item{name}{Optional name of the tensor for the error message.} 17 | } 18 | \value{ 19 | A list of dimensions of the shape of tensor. All static dimensions 20 | will be returned as native integers, and dynamic dimensions will be 21 | returned as tf.Tensor scalars. (I'm not very comfortable with this 22 | behavior. It's not usually good practice to make the return type vary 23 | depending on the input.) 24 | } 25 | \description{ 26 | Returns a list of the shape of tensor, preferring static dimensions. (A 27 | static dimension is known at graph definition time, and a dynamic dimension 28 | is known only at graph execution time.) 
29 | https://stackoverflow.com/questions/37096225/ 30 | } 31 | \examples{ 32 | \dontrun{ 33 | with(tensorflow::tf$variable_scope("examples", 34 | reuse = tensorflow::tf$AUTO_REUSE 35 | ), { 36 | phx <- tensorflow::tf$placeholder(tensorflow::tf$int32, shape = c(4)) 37 | get_shape_list(phx) # static 38 | tfu <- tensorflow::tf$unique(phx) 39 | tfy <- tfu$y 40 | get_shape_list(tfy) # dynamic 41 | }) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /man/input_fn_builder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{input_fn_builder} 4 | \alias{input_fn_builder} 5 | \title{Create an \code{input_fn} closure to be passed to TPUEstimator} 6 | \usage{ 7 | input_fn_builder(features, seq_length, is_training, drop_remainder) 8 | } 9 | \arguments{ 10 | \item{features}{A list of features (objects of class \code{InputFeatures}).} 11 | 12 | \item{seq_length}{Integer; the maximum length (number of tokens) of each 13 | example. (Examples should already be padded to this length by this point.)} 14 | 15 | \item{is_training}{Logical; whether these are training examples.} 16 | 17 | \item{drop_remainder}{Logical; whether to drop the extra if the number of 18 | elements in the dataset is not an exact multiple of the batch size,} 19 | } 20 | \value{ 21 | An \code{input_fn} closure to be passed to TPUEstimator. 22 | } 23 | \description{ 24 | Creates an \code{input_fn} closure to be passed to TPUEstimator. The output 25 | of this closure is the (modified) output of 26 | \code{tensorflow::tf$data$Dataset$from_tensor_slices} (an object of class 27 | "tensorflow.python.data.ops.dataset_ops.BatchDataset"). 28 | } 29 | \examples{ 30 | \dontrun{ 31 | tokenizer <- FullTokenizer("vocab.txt") 32 | seq_len <- 15L 33 | input_ex1 <- InputExample( 34 | guid = 1L, 35 | text_a = "Some text to classify.", 36 | text_b = "More wordy words.", 37 | label = "good" 38 | ) 39 | input_ex2 <- InputExample( 40 | guid = 2L, 41 | text_a = "This is another example.", 42 | text_b = "So many words.", 43 | label = "bad" 44 | ) 45 | feat <- convert_examples_to_features( 46 | examples = list(input_ex1, input_ex2), 47 | label_list = c("good", "bad"), 48 | max_seq_length = seq_len, 49 | tokenizer = tokenizer 50 | ) 51 | input_fn <- input_fn_builder( 52 | features = feat, 53 | seq_length = seq_len, 54 | is_training = TRUE, 55 | drop_remainder = FALSE 56 | ) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/input_fn_builder_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{input_fn_builder_EF} 4 | \alias{input_fn_builder_EF} 5 | \title{Create an \code{input_fn} closure to be passed to TPUEstimator} 6 | \usage{ 7 | input_fn_builder_EF(features, seq_length) 8 | } 9 | \arguments{ 10 | \item{features}{A list of features (objects of class 11 | \code{InputFeatures_EF}).} 12 | 13 | \item{seq_length}{Integer; the maximum length (number of tokens) of each 14 | example. (Examples should already be padded to this length by this point.)} 15 | } 16 | \value{ 17 | An \code{input_fn} closure to be passed to TPUEstimator. 18 | } 19 | \description{ 20 | Creates an \code{input_fn} closure to be passed to TPUEstimator. 
The output 21 | of this closure is the (modified) output of 22 | \code{tensorflow::tf$data$Dataset$from_tensor_slices} (an object of class 23 | "tensorflow.python.data.ops.dataset_ops.BatchDataset"). This function is 24 | similar to \code{input_fn_builder} from run_classifier.R. (RBERT issue #28.) 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/is_chinese_char.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_chinese_char} 4 | \alias{is_chinese_char} 5 | \title{Check whether cp is the codepoint of a CJK character.} 6 | \usage{ 7 | is_chinese_char(cp) 8 | } 9 | \arguments{ 10 | \item{cp}{A unicode codepoint, as an integer.} 11 | } 12 | \value{ 13 | Logical TRUE if cp is codepoint of a CJK character. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._is_chinese_char from 17 | BERT: tokenization.py. From that file: 18 | This defines a "chinese character" as anything in the CJK Unicode block: 19 | https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 20 | } 21 | \details{ 22 | Note that the CJK Unicode block is NOT all Japanese and Korean characters, 23 | despite its name. The modern Korean Hangul alphabet is a different block, 24 | as is Japanese Hiragana and Katakana. Those alphabets are used to write 25 | space-separated words, so they are not treated specially and are handled 26 | like the alphabets of the other languages.) 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/is_control.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_control} 4 | \alias{is_control} 5 | \title{Check whether `char` is a control character.} 6 | \usage{ 7 | is_control(char) 8 | } 9 | \arguments{ 10 | \item{char}{A character scalar, comprising a single unicode character.} 11 | } 12 | \value{ 13 | TRUE if char is a control character. 14 | } 15 | \description{ 16 | (R implementation of _is_control from BERT: tokenization.py.) 17 | } 18 | \details{ 19 | "\\t", "\\n", and "\\r" are technically control characters but we treat them 20 | as whitespace since they are generally considered as such. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/is_punctuation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_punctuation} 4 | \alias{is_punctuation} 5 | \title{Check whether `char` is a punctuation character.} 6 | \usage{ 7 | is_punctuation(char) 8 | } 9 | \arguments{ 10 | \item{char}{A character scalar, comprising a single unicode character.} 11 | } 12 | \value{ 13 | TRUE if char is a punctuation character. 14 | } 15 | \description{ 16 | (R implementation of _is_punctuation from BERT: tokenization.py.) 17 | } 18 | \details{ 19 | We treat all non-letter/number ASCII as punctuation. 20 | Characters such as "^", "$", and "`" are not in the Unicode 21 | Punctuation class but we treat them as punctuation anyway, for 22 | consistency. 
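For example, \code{is_punctuation("$")} or \code{is_punctuation("^")} should return TRUE even though those characters are outside the Unicode Punctuation class, while \code{is_punctuation("a")} should return FALSE.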
23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/is_whitespace.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_whitespace} 4 | \alias{is_whitespace} 5 | \title{Check whether `char` is a whitespace character.} 6 | \usage{ 7 | is_whitespace(char) 8 | } 9 | \arguments{ 10 | \item{char}{A character scalar, comprising a single unicode character.} 11 | } 12 | \value{ 13 | TRUE if char is a whitespace character. 14 | } 15 | \description{ 16 | (R implementation of _is_whitespace from BERT: tokenization.py.) 17 | } 18 | \details{ 19 | "\\t", "\\n", and "\\r" are technically control characters but we treat them 20 | as whitespace since they are generally considered as such. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/layer_norm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{layer_norm} 4 | \alias{layer_norm} 5 | \title{Run layer normalization} 6 | \usage{ 7 | layer_norm(input_tensor, name = NULL) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Tensor to perform layor normalization on.} 11 | 12 | \item{name}{Optional variable_scope for layer_norm.} 13 | } 14 | \value{ 15 | A Tensor of the same shape and type as `input_tensor`, with 16 | normalization applied. 17 | } 18 | \description{ 19 | Run layer normalization on the last dimension of the tensor. 20 | } 21 | \details{ 22 | Wrapper around tensorflow layer_norm function. From tensorflow documentation: 23 | Adds a Layer Normalization layer. Based on the paper: 24 | \url{https://arxiv.org/abs/1607.06450}. 25 | 26 | Note: \code{begin_norm_axis}: The first normalization dimension: 27 | normalization will be performed along dimensions (begin_norm_axis : 28 | rank(inputs) ) 29 | 30 | \code{begin_params_axis}: The first parameter (beta, gamma) dimension: scale 31 | and centering parameters will have dimensions (begin_params_axis : 32 | rank(inputs) ) and will be broadcast with the normalized inputs accordingly. 33 | } 34 | \examples{ 35 | \dontrun{ 36 | tfx <- tensorflow::tf$get_variable("example", tensorflow::shape(10L)) 37 | layer_norm(tfx) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /man/layer_norm_and_dropout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{layer_norm_and_dropout} 4 | \alias{layer_norm_and_dropout} 5 | \title{Run layer normalization followed by dropout} 6 | \usage{ 7 | layer_norm_and_dropout(input_tensor, dropout_prob = NULL, name = NULL) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Float Tensor to perform layer_norm and dropout on.} 11 | 12 | \item{dropout_prob}{A double describing the probability of dropping out a 13 | value (NOT of KEEPING a dimension as in `tf.nn.dropout`).} 14 | 15 | \item{name}{Optional variable_scope for layer_norm.} 16 | } 17 | \value{ 18 | Tensor resulting from applying layer_norm and dropout to 19 | \code{input_tensor}. 
20 | } 21 | \description{ 22 | Run layer normalization followed by dropout 23 | } 24 | \examples{ 25 | \dontrun{ 26 | tfx <- tensorflow::tf$get_variable("example2", tensorflow::shape(10L)) 27 | layer_norm_and_dropout(tfx, dropout_prob = 0.5) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/load_vocab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{load_vocab} 4 | \alias{load_vocab} 5 | \title{Load a vocabulary file} 6 | \usage{ 7 | load_vocab(vocab_file) 8 | } 9 | \arguments{ 10 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text 11 | file, with one token per line, with the line number corresponding to the 12 | index of that token in the vocabulary.} 13 | } 14 | \value{ 15 | In the BERT Python code, the vocab is returned as an OrderedDict 16 | from the collections package. Here we return the vocab as a named integer 17 | vector. Names are tokens in vocabulary, values are integer indices. 18 | } 19 | \description{ 20 | Load a vocabulary file 21 | } 22 | \examples{ 23 | \dontrun{ 24 | vocab <- load_vocab(vocab_file = "vocab.txt") 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/make_examples_simple.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{make_examples_simple} 4 | \alias{make_examples_simple} 5 | \title{Easily make examples for BERT} 6 | \usage{ 7 | make_examples_simple(seq_list) 8 | } 9 | \arguments{ 10 | \item{seq_list}{Character vector or list; text to turn into examples.} 11 | } 12 | \value{ 13 | A list of \code{InputExample_EF} objects. 14 | } 15 | \description{ 16 | A simple wrapper function to turn a list of text (as a character 17 | vector or list) into a list of examples suitable for use with RBERT. If the 18 | input is a flat list or vector of characters, the examples will be 19 | single-segment, with NULL for the second segment. If the input contains 20 | length-2 sublists or vectors, those examples will be two-segment sequences, 21 | e.g. for doing sentence-pair classification. 22 | } 23 | \examples{ 24 | input_ex <- make_examples_simple(c( 25 | "Here are some words.", 26 | "Here are some more words." 27 | )) 28 | input_ex2 <- make_examples_simple(list( 29 | c( 30 | "First sequence, first segment.", 31 | "First sequence, second segment." 32 | ), 33 | c( 34 | "Second sequence, first segment.", 35 | "Second sequence, second segment." 
36 | ) 37 | )) 38 | } 39 | -------------------------------------------------------------------------------- /man/model_fn_builder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{model_fn_builder} 4 | \alias{model_fn_builder} 5 | \title{Define \code{model_fn} closure for \code{TPUEstimator}} 6 | \usage{ 7 | model_fn_builder( 8 | bert_config, 9 | num_labels, 10 | init_checkpoint, 11 | learning_rate, 12 | num_train_steps, 13 | num_warmup_steps, 14 | use_tpu 15 | ) 16 | } 17 | \arguments{ 18 | \item{bert_config}{\code{BertConfig} instance.} 19 | 20 | \item{num_labels}{Integer; number of classification labels.} 21 | 22 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 23 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 24 | explicit, starting with "/".} 25 | 26 | \item{learning_rate}{Numeric; the learning rate.} 27 | 28 | \item{num_train_steps}{Integer; number of steps to train for.} 29 | 30 | \item{num_warmup_steps}{Integer; number of steps to use for "warm-up".} 31 | 32 | \item{use_tpu}{Logical; whether to use TPU.} 33 | } 34 | \value{ 35 | \code{model_fn} closure for \code{TPUEstimator}. 36 | } 37 | \description{ 38 | Returns \code{model_fn} closure, which is an input to \code{TPUEstimator}. 39 | } 40 | \details{ 41 | The \code{model_fn} function takes four parameters: \describe{ 42 | \item{features}{A list (or similar structure) that contains objects such as 43 | \code{input_ids}, \code{input_mask}, \code{segment_ids}, and 44 | \code{label_ids}. These objects will be inputs to the \code{create_model} 45 | function.} 46 | \item{labels}{Not used in this function, but presumably we need to 47 | keep this slot here.} 48 | \item{mode}{Character; value such as "train", "infer", 49 | or "eval".} 50 | \item{params}{Not used in this function, but presumably we need 51 | to keep this slot here.} 52 | } 53 | 54 | The output of \code{model_fn} is the result of a 55 | \code{tf$contrib$tpu$TPUEstimatorSpec} call. 
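As a rough sketch (a hypothetical skeleton for orientation only, not the actual implementation in R/run_classifier.R), the returned closure has this shape:
\preformatted{
model_fn <- function(features, labels, mode, params) {
  # 'features' carries the objects described above
  input_ids   <- features$input_ids
  input_mask  <- features$input_mask
  segment_ids <- features$segment_ids
  label_ids   <- features$label_ids
  # ... build the network with create_model(), initialize weights from
  # init_checkpoint, and return a tf$contrib$tpu$TPUEstimatorSpec ...
}
}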
56 | 57 | This reference may be helpful: 58 | \url{https://tensorflow.rstudio.com/tfestimators/articles/creating_estimators.html} 59 | } 60 | \examples{ 61 | \dontrun{ 62 | with(tensorflow::tf$variable_scope("examples", 63 | reuse = tensorflow::tf$AUTO_REUSE 64 | ), { 65 | input_ids <- tensorflow::tf$constant(list( 66 | list(31L, 51L, 99L), 67 | list(15L, 5L, 0L) 68 | )) 69 | 70 | input_mask <- tensorflow::tf$constant(list( 71 | list(1L, 1L, 1L), 72 | list(1L, 1L, 0L) 73 | )) 74 | token_type_ids <- tensorflow::tf$constant(list( 75 | list(0L, 0L, 1L), 76 | list(0L, 2L, 0L) 77 | )) 78 | config <- BertConfig( 79 | vocab_size = 30522L, 80 | hidden_size = 768L, 81 | num_hidden_layers = 8L, 82 | type_vocab_size = 2L, 83 | num_attention_heads = 12L, 84 | intermediate_size = 3072L 85 | ) 86 | 87 | temp_dir <- tempdir() 88 | init_checkpoint <- file.path( 89 | temp_dir, 90 | "BERT_checkpoints", 91 | "uncased_L-12_H-768_A-12", 92 | "bert_model.ckpt" 93 | ) 94 | 95 | example_mod_fn <- model_fn_builder( 96 | bert_config = config, 97 | num_labels = 2L, 98 | init_checkpoint = init_checkpoint, 99 | learning_rate = 0.01, 100 | num_train_steps = 20L, 101 | num_warmup_steps = 10L, 102 | use_tpu = FALSE 103 | ) 104 | }) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /man/reshape_from_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{reshape_from_matrix} 4 | \alias{reshape_from_matrix} 5 | \title{Turn a matrix into a tensor} 6 | \usage{ 7 | reshape_from_matrix(output_tensor, orig_shape_list) 8 | } 9 | \arguments{ 10 | \item{output_tensor}{Tensor to reshape. What a lousy name for an input.} 11 | 12 | \item{orig_shape_list}{Shape to cast Tensor into.} 13 | } 14 | \value{ 15 | The Tensor reshaped to rank specified by orig_shape_list. 16 | } 17 | \description{ 18 | Reshapes a rank 2 tensor back to its original rank >= 2 tensor. The final 19 | dimension ('width') of the tensor is assumed to be preserved. If a different 20 | width is requested, function will complain. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | with( 25 | tensorflow::tf$variable_scope("examples", 26 | reuse = tensorflow::tf$AUTO_REUSE 27 | ), 28 | r2t <- tensorflow::tf$get_variable("r2t", 29 | dtype = "int32", 30 | shape = c(10, 20) 31 | ) 32 | ) 33 | reshape_from_matrix(r2t, orig_shape_list = c(5L, 2L, 20L)) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /man/reshape_to_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{reshape_to_matrix} 4 | \alias{reshape_to_matrix} 5 | \title{Turn a tensor into a matrix} 6 | \usage{ 7 | reshape_to_matrix(input_tensor) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Tensor to reshape.} 11 | } 12 | \value{ 13 | The Tensor reshaped to rank 2. 14 | } 15 | \description{ 16 | Reshapes a >= rank 2 tensor to a rank 2 tensor. The last dimension is 17 | preserved; the rest are flattened. 
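For example, a Tensor of shape \code{[batch_size, seq_length, width]} should come back with shape \code{[batch_size * seq_length, width]}.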
18 | } 19 | \examples{ 20 | \dontrun{ 21 | with( 22 | tensorflow::tf$variable_scope("examples", 23 | reuse = tensorflow::tf$AUTO_REUSE 24 | ), 25 | r3t <- tensorflow::tf$get_variable("r3t", 26 | dtype = "int32", 27 | shape = c(10, 20, 3) 28 | ) 29 | ) 30 | reshape_to_matrix(r3t) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/set_BERT_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{set_BERT_dir} 4 | \alias{set_BERT_dir} 5 | \title{Set the directory for BERT checkpoints} 6 | \usage{ 7 | set_BERT_dir(dir) 8 | } 9 | \arguments{ 10 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 11 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 12 | determined from the \code{dir} parameter if supplied, followed by the 13 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 14 | folder in the user cache directory (determined using 15 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 16 | `RBERT.dir` option will be updated to that location. Note that the 17 | checkpoint will create a subdirectory inside this \code{dir}.} 18 | } 19 | \value{ 20 | A list with the previous value of `BERT.dir` (invisibly). 21 | } 22 | \description{ 23 | Set a given \code{dir} as the default BERT checkpoint directory for this 24 | session, and create it if it does not exist. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | set_BERT_dir("fake_dir") 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/split_on_punc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{split_on_punc} 4 | \alias{split_on_punc} 5 | \title{Split text on punctuation.} 6 | \usage{ 7 | split_on_punc(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar, encoded as utf-8.} 11 | } 12 | \value{ 13 | The input text as a character vector, split on punctuation 14 | characters. 15 | } 16 | \description{ 17 | (R implementation of BasicTokenizer._run_split_on_punc from 18 | BERT: tokenization.py.) 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/strip_accents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{strip_accents} 4 | \alias{strip_accents} 5 | \title{Strip accents from a piece of text.} 6 | \usage{ 7 | strip_accents(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar, encoded as utf-8.} 11 | } 12 | \value{ 13 | text with accents removed. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._run_strip_accents from 17 | BERT: tokenization.py.) 
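For example, "é" should be reduced to "e", so an input like "café" becomes "cafe".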
18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/tokenize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize} 4 | \alias{tokenize} 5 | \alias{tokenize.FullTokenizer} 6 | \alias{tokenize.BasicTokenizer} 7 | \alias{tokenize.WordpieceTokenizer} 8 | \title{Tokenizers for various objects.} 9 | \usage{ 10 | tokenize(tokenizer, text) 11 | 12 | \method{tokenize}{FullTokenizer}(tokenizer, text) 13 | 14 | \method{tokenize}{BasicTokenizer}(tokenizer, text) 15 | 16 | \method{tokenize}{WordpieceTokenizer}(tokenizer, text) 17 | } 18 | \arguments{ 19 | \item{tokenizer}{The Tokenizer object to refer to.} 20 | 21 | \item{text}{The text to tokenize. For tokenize.WordpieceTokenizer, the text 22 | should have already been passed through BasicTokenizer.} 23 | } 24 | \value{ 25 | A list of tokens. 26 | } 27 | \description{ 28 | This tokenizer performs some basic cleaning, then splits up text on 29 | whitespace and punctuation. 30 | } 31 | \section{Methods (by class)}{ 32 | \itemize{ 33 | \item \code{FullTokenizer}: Tokenizer method for objects of FullTokenizer class. 34 | 35 | \item \code{BasicTokenizer}: Tokenizer method for objects of BasicTokenizer class. 36 | 37 | \item \code{WordpieceTokenizer}: Tokenizer method for objects of WordpieceTokenizer 38 | class. This uses a greedy longest-match-first algorithm to perform 39 | tokenization using the given vocabulary. For example: input = "unaffable" 40 | output = list("un", "##aff", "##able") ... although, ironically, the BERT 41 | vocabulary actually gives output = list("una", "##ffa", "##ble") for this 42 | example, even though they use it as an example in their code. 43 | }} 44 | 45 | \examples{ 46 | \dontrun{ 47 | tokenizer <- FullTokenizer("vocab.txt", TRUE) 48 | tokenize(tokenizer, text = "a bunch of words") 49 | } 50 | \dontrun{ 51 | tokenizer <- BasicTokenizer(TRUE) 52 | tokenize(tokenizer, text = "a bunch of words") 53 | } 54 | \dontrun{ 55 | vocab <- load_vocab(vocab_file = "vocab.txt") 56 | tokenizer <- WordpieceTokenizer(vocab) 57 | tokenize(tokenizer, text = "a bunch of words") 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /man/tokenize_chinese_chars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize_chinese_chars} 4 | \alias{tokenize_chinese_chars} 5 | \title{Add whitespace around any CJK character.} 6 | \usage{ 7 | tokenize_chinese_chars(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar.} 11 | } 12 | \value{ 13 | Text with spaces around CJK characters. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._tokenize_chinese_chars from 17 | BERT: tokenization.py.) This may result in doubled-up spaces, 18 | but that's the behavior of the python code... 
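For example, in an input like "abc漢字def", each CJK character should come back surrounded by spaces (with a doubled space between the two adjacent CJK characters).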
19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/tokenize_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize_text} 4 | \alias{tokenize_text} 5 | \title{Tokenize Text with Word Pieces} 6 | \usage{ 7 | tokenize_text( 8 | text, 9 | ckpt_dir = NULL, 10 | vocab_file = find_vocab(ckpt_dir), 11 | include_special = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{text}{Character vector; text to tokenize.} 16 | 17 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 18 | other checkpoint files required by this function (\code{vocab_file}, 19 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 20 | standard filenames within \code{ckpt_dir}.} 21 | 22 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 23 | with one token per line, with the line number corresponding to the index of 24 | that token in the vocabulary.} 25 | 26 | \item{include_special}{Logical; whether to add the special tokens "[CLS]" (at 27 | the beginning) and "[SEP]" (at the end) of the token list.} 28 | } 29 | \value{ 30 | A list of character vectors, giving the tokenization of the input 31 | text. 32 | } 33 | \description{ 34 | Given some text and a word piece vocabulary, tokenizes the text. This is 35 | primarily a tool for quickly checking the tokenization of a piece of text. 36 | } 37 | \examples{ 38 | \dontrun{ 39 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 40 | tokens <- tokenize_text( 41 | text = c("Who doesn't like tacos?", "Not me!"), 42 | ckpt_dir = BERT_PRETRAINED_DIR 43 | ) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /man/tokenize_word.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize_word} 4 | \alias{tokenize_word} 5 | \title{Tokenize a single "word" (no whitespace).} 6 | \usage{ 7 | tokenize_word(word, vocab, unk_token = "[UNK]", max_chars = 100) 8 | } 9 | \arguments{ 10 | \item{word}{Word to tokenize.} 11 | 12 | \item{vocab}{Character vector containing vocabulary words} 13 | 14 | \item{unk_token}{Token to represent unknown words.} 15 | 16 | \item{max_chars}{Maximum length of word recognized.} 17 | } 18 | \value{ 19 | Input word as a list of tokens. 20 | } 21 | \description{ 22 | In BERT: tokenization.py, 23 | this code is inside the tokenize method for WordpieceTokenizer objects. 24 | I've moved it into its own function for clarity. 25 | Punctuation should already have been removed from the word. 
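With the toy vocabulary used in the examples below, "unknown" should split into "un" followed by "##known", while "known" has no matching initial piece (the vocabulary only contains the word-internal piece "##known") and should come back as the unknown token "[UNK]".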
26 | } 27 | \examples{ 28 | tokenize_word("unknown", vocab = c("un" = 0, "##known" = 1)) 29 | tokenize_word("known", vocab = c("un" = 0, "##known" = 1)) 30 | } 31 | -------------------------------------------------------------------------------- /man/transformer_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{transformer_model} 4 | \alias{transformer_model} 5 | \title{Build multi-head, multi-layer Transformer} 6 | \usage{ 7 | transformer_model( 8 | input_tensor, 9 | attention_mask = NULL, 10 | hidden_size = 768L, 11 | num_hidden_layers = 12L, 12 | num_attention_heads = 12L, 13 | intermediate_size = 3072L, 14 | intermediate_act_fn = gelu, 15 | hidden_dropout_prob = 0.1, 16 | attention_probs_dropout_prob = 0.1, 17 | initializer_range = 0.02, 18 | do_return_all_layers = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_tensor}{Float Tensor of shape \code{[batch_size, seq_length, 23 | hidden_size]}.} 24 | 25 | \item{attention_mask}{(Optional) Integer Tensor of shape \code{batch_size, 26 | seq_length, seq_length}, with 1 for positions that can be attended to and 0 27 | in positions that should not be.} 28 | 29 | \item{hidden_size}{Integer; hidden size of the Transformer.} 30 | 31 | \item{num_hidden_layers}{Integer; number of layers (blocks) in the 32 | Transformer.} 33 | 34 | \item{num_attention_heads}{Integer; number of attention heads in the 35 | Transformer.} 36 | 37 | \item{intermediate_size}{Integer; the size of the "intermediate" (a.k.a., 38 | feed forward) layer.} 39 | 40 | \item{intermediate_act_fn}{The non-linear activation function to apply to the 41 | output of the intermediate/feed-forward layer. (Function, not character.)} 42 | 43 | \item{hidden_dropout_prob}{Numeric; the dropout probability for the hidden 44 | layers.} 45 | 46 | \item{attention_probs_dropout_prob}{Numeric; the dropout probability of the 47 | attention probabilities.} 48 | 49 | \item{initializer_range}{Numeric; the range of the initializer (stddev of 50 | truncated normal).} 51 | 52 | \item{do_return_all_layers}{Logical; whether to also return all layers or 53 | just the final layer. If this is TRUE, will also return attention 54 | probabilities.} 55 | } 56 | \value{ 57 | float Tensor of shape \code{[batch_size, seq_length, hidden_size]}, 58 | the final hidden layer of the Transformer. Or if `do_return_all_layers` is 59 | `TRUE`, a list of such Tensors (one for each hidden layer). 60 | } 61 | \description{ 62 | Multi-headed, multi-layer Transformer from "Attention is All You Need". This 63 | is almost an exact implementation of the original Transformer encoder. 
64 | } 65 | \details{ 66 | See the original paper: \url{https://arxiv.org/abs/1706.03762} 67 | 68 | Also see: 69 | \url{https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py} 70 | } 71 | \examples{ 72 | \dontrun{ 73 | batch_size <- 10 74 | seq_length <- 500 75 | hidden_size <- 120 76 | 77 | with(tensorflow::tf$variable_scope("examples", 78 | reuse = tensorflow::tf$AUTO_REUSE 79 | ), { 80 | input_tensor <- tensorflow::tf$get_variable("input", 81 | shape = c( 82 | batch_size, 83 | seq_length, 84 | hidden_size 85 | ) 86 | ) 87 | }) 88 | 89 | model_t <- transformer_model( 90 | input_tensor = input_tensor, 91 | hidden_size = hidden_size 92 | ) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /man/transpose_for_scores.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{transpose_for_scores} 4 | \alias{transpose_for_scores} 5 | \title{Reshape and transpose tensor} 6 | \usage{ 7 | transpose_for_scores( 8 | input_tensor, 9 | batch_size, 10 | num_attention_heads, 11 | seq_length, 12 | width 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_tensor}{Tensor to reshape and transpose.} 17 | 18 | \item{batch_size}{Size of the first dimension of input_tensor.} 19 | 20 | \item{num_attention_heads}{Size of the third dimension of input_tensor. (Will 21 | be transposed to second dimension.)} 22 | 23 | \item{seq_length}{Size of the second dimension of input_tensor. (Will be 24 | transposed to third dimension.)} 25 | 26 | \item{width}{Size of fourth dimension of input_tensor.} 27 | } 28 | \value{ 29 | Tensor of shape: batch_size, num_attention_heads, seq_length, width. 30 | } 31 | \description{ 32 | In Python code, this is internal to attention_layer. Pulling it out into 33 | separate function here. 34 | } 35 | \keyword{internal} 36 | -------------------------------------------------------------------------------- /man/truncate_seq_pair.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{truncate_seq_pair} 4 | \alias{truncate_seq_pair} 5 | \title{Truncate a sequence pair to the maximum length.} 6 | \usage{ 7 | truncate_seq_pair(tokens_a, tokens_b, max_length) 8 | } 9 | \arguments{ 10 | \item{tokens_a}{Character; a vector of tokens in the first input sequence.} 11 | 12 | \item{tokens_b}{Character; a vector of tokens in the second input sequence.} 13 | 14 | \item{max_length}{Integer; the maximum total length of the two sequences.} 15 | } 16 | \value{ 17 | A list containing two character vectors: trunc_a and trunc_b. 18 | } 19 | \description{ 20 | Truncates a sequence pair to the maximum length. 21 | This is a simple heuristic which will always truncate the longer sequence one 22 | token at a time (or the first sequence in case of a tie -JDB). This makes 23 | more sense than truncating an equal percent of tokens from each, since if one 24 | sequence is very short then each token that's truncated likely contains more 25 | information than a longer sequence. 26 | } 27 | \details{ 28 | The python code truncated the sequences in place, using the pass-by-reference 29 | functionality of python. In R, we return the truncated sequences in a list. 
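For instance, with the tie-breaking rule described above (and assuming tokens are dropped from the end of the longer vector), the example below, which truncates \code{c("a", "b", "c", "d")} and \code{c("w", "x", "y", "z")} to \code{max_length = 5}, should give \code{trunc_a = c("a", "b")} and \code{trunc_b = c("w", "x", "y")}.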
30 | } 31 | \examples{ 32 | \dontrun{ 33 | tokens_a <- c("a", "b", "c", "d") 34 | tokens_b <- c("w", "x", "y", "z") 35 | truncate_seq_pair(tokens_a, tokens_b, 5) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/whitespace_tokenize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{whitespace_tokenize} 4 | \alias{whitespace_tokenize} 5 | \title{Run basic whitespace cleaning and splitting on a piece of text.} 6 | \usage{ 7 | whitespace_tokenize(text) 8 | } 9 | \arguments{ 10 | \item{text}{Character scalar to tokenize.} 11 | } 12 | \value{ 13 | Character vector of tokens. 14 | } 15 | \description{ 16 | Run basic whitespace cleaning and splitting on a piece of text. 17 | } 18 | \examples{ 19 | whitespace_tokenize(text = " some\ttext \n with whitespace ") 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(RBERT) 3 | 4 | test_check("RBERT") 5 | -------------------------------------------------------------------------------- /tests/testthat/attention_probs.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/attention_probs.rds -------------------------------------------------------------------------------- /tests/testthat/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /tests/testthat/sample_amap.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/sample_amap.rds -------------------------------------------------------------------------------- /tests/testthat/sample_feat_in.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/sample_feat_in.rds -------------------------------------------------------------------------------- /tests/testthat/sample_feats.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/sample_feats.rds -------------------------------------------------------------------------------- /tests/testthat/setup.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | checkpoint_main_dir <- tempdir() 16 | 17 | # We need the checkpoint to be available for the other tests, so "download" it 18 | # here. We use a mock function for the part that does the actual downloading, 19 | # and instead copy from tests/testthat/test_checkpoints. 20 | 21 | # First we need to check if the user has bert_base_uncased.zip. If they don't, 22 | # they still have to download that one. 23 | 24 | print("Setting up test checkpoint.") 25 | if (!file.exists("test_checkpoints/bert_base_uncased.zip")) { 26 | destfile <- normalizePath( 27 | "test_checkpoints/bert_base_uncased.zip", 28 | mustWork = FALSE 29 | ) 30 | 31 | status <- utils::download.file( 32 | url = "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip", 33 | destfile = destfile, 34 | method = "libcurl" 35 | ) 36 | } 37 | 38 | dont_download_checkpoint <- function(url, checkpoint_zip_path) { 39 | root_dir <- "test_checkpoints" 40 | 41 | from_file <- switch( 42 | url, 43 | "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip" = "bert_base_uncased.zip", 44 | "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/scibert_scivocab_uncased.tar.gz" = "testing_checkpoint.tar.gz" 45 | ) 46 | 47 | from_path <- file.path(root_dir, from_file) 48 | 49 | file.copy( 50 | from = from_path, 51 | to = checkpoint_zip_path, 52 | overwrite = TRUE 53 | ) 54 | 55 | invisible(TRUE) 56 | } 57 | 58 | mockery::stub( 59 | where = download_BERT_checkpoint, 60 | what = ".download_BERT_checkpoint", 61 | how = dont_download_checkpoint 62 | ) 63 | 64 | cpdir <- download_BERT_checkpoint( 65 | model = "bert_base_uncased", 66 | dir = checkpoint_main_dir 67 | ) 68 | -------------------------------------------------------------------------------- /tests/testthat/teardown.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | unlink(cpdir, recursive = TRUE) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-download_checkpoint.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | test_that("download_checkpoint works", { 17 | # checkpoint was downloaded in setup.R 18 | # Redownloading the checkpoint should occur without incident. 19 | new_cpdir <- download_BERT_checkpoint( 20 | model = "bert_base_uncased", 21 | dir = checkpoint_main_dir 22 | ) 23 | expect_identical(new_cpdir, cpdir) 24 | 25 | testthat::expect_true( 26 | file.exists(file.path(cpdir, "vocab.txt")) 27 | ) 28 | testthat::expect_true( 29 | file.exists(file.path(cpdir, "bert_config.json")) 30 | ) 31 | testthat::expect_true( 32 | file.exists(file.path(cpdir, "bert_model.ckpt.index")) 33 | ) 34 | testthat::expect_true( 35 | file.exists(file.path(cpdir, "bert_model.ckpt.meta")) 36 | ) 37 | testthat::expect_true( 38 | file.exists(file.path(cpdir, "bert_model.ckpt.data-00000-of-00001")) 39 | ) 40 | }) 41 | 42 | test_that("dir chooser works.", { 43 | expect_identical( 44 | .choose_BERT_dir("fake"), "fake" 45 | ) 46 | temp_dir <- tempdir() 47 | testing_dir <- paste0(temp_dir, "/testing") 48 | old_dir <- set_BERT_dir(testing_dir) 49 | expect_identical( 50 | normalizePath(getOption("BERT.dir"), mustWork = FALSE), 51 | normalizePath(testing_dir, mustWork = FALSE) 52 | ) 53 | 54 | # If I don't send it a dir, first it should try the option. 55 | expect_identical( 56 | .choose_BERT_dir(NULL), 57 | normalizePath(testing_dir, mustWork = FALSE) 58 | ) 59 | 60 | # If I don't have an option or a dir, it should use the default. 61 | options(BERT.dir = NULL) 62 | default_dir <- rappdirs::user_cache_dir("RBERT") 63 | expect_identical( 64 | .choose_BERT_dir(NULL), 65 | default_dir 66 | ) 67 | 68 | # Go back to the existing setting. 69 | options(BERT.dir = old_dir$BERT.dir) 70 | 71 | # Get rid of the empty dir. 72 | unlink(normalizePath(testing_dir), recursive = TRUE) 73 | }) 74 | 75 | test_that("Can download a cp by url.", { 76 | # The auto-generated target dir will be different from the one we saved in, so 77 | # move the one we downloaded, attempt to DL without forcing, then move it 78 | # back. That should make sure everything is working as expected. 79 | target_dir <- file.path( 80 | checkpoint_main_dir, "uncased_L-12_H-768_A-12" 81 | ) 82 | 83 | # Explicitly create the dir so file.copy can copy recursively. 84 | dir.create(target_dir) 85 | 86 | file.copy( 87 | cpdir, 88 | target_dir, 89 | recursive = TRUE 90 | ) 91 | 92 | google_base_url <- "https://storage.googleapis.com/bert_models/" 93 | bert_base_uncased_url <- paste0( 94 | google_base_url, 95 | "2018_10_18/uncased_L-12_H-768_A-12.zip" 96 | ) 97 | 98 | expect_warning( 99 | expect_identical( 100 | download_BERT_checkpoint( 101 | url = bert_base_uncased_url 102 | ), 103 | normalizePath(target_dir) 104 | ), 105 | NA 106 | ) 107 | 108 | unlink(target_dir, recursive = TRUE) 109 | 110 | # We also need to test one that has tar-gz. 
111 | scibert_url <- paste0( 112 | "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/", 113 | "tensorflow_models/scibert_scivocab_uncased.tar.gz" 114 | ) 115 | expect_warning( 116 | scibert_path <- download_BERT_checkpoint( 117 | url = scibert_url, dir = checkpoint_main_dir 118 | ), 119 | NA 120 | ) 121 | 122 | unlink(scibert_path, recursive = TRUE) 123 | }) 124 | 125 | test_that(".has_checkpoint works as expected.", { 126 | # We don't use this in the "easy" mode anymore, but I want to keep the extra 127 | # option around (inferring the subdir) until I'm *sure* we don't need it. 128 | expect_error( 129 | expect_true(.has_checkpoint(model = "bert_base_uncased")), 130 | NA 131 | ) 132 | }) 133 | -------------------------------------------------------------------------------- /tests/testthat/test-extract_features.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | test_that("features and examples routines work", { 16 | examples <- list( 17 | InputExample_EF( 18 | unique_id = 1, 19 | text_a = "I saw the branch on the bank.", 20 | text_b = "A second sequence of words." 21 | ), 22 | InputExample_EF( 23 | unique_id = 2, 24 | text_a = "I saw the branch of the bank." 25 | ) 26 | ) 27 | # tokenizer <- FullTokenizer("vocab.txt") 28 | # saveRDS(tokenizer, here::here("tests", "testthat", "tokenizer.rds")) 29 | # tokenizer <- readRDS(here::here("tests", "testthat", "tokenizer.rds")) 30 | tokenizer <- readRDS("tokenizer.rds") 31 | feat_in <- .convert_single_example_EF( 32 | ex_index = 6L, 33 | example = examples[[2]], 34 | seq_length = 5L, 35 | tokenizer = tokenizer 36 | ) 37 | expected_feat_in <- readRDS("sample_feat_in.rds") 38 | testthat::expect_identical(feat_in, expected_feat_in) 39 | 40 | 41 | # Run these tests only if checkpoint is found. 42 | BERT_PRETRAINED_DIR <- cpdir # from setup.R 43 | 44 | # Test the ckpt_dir argument here. (Expect no error.) 45 | feats <- extract_features( 46 | examples = examples, 47 | ckpt_dir = BERT_PRETRAINED_DIR, 48 | batch_size = 2L 49 | ) 50 | 51 | testthat::expect_error( 52 | extract_features( 53 | examples = examples, 54 | batch_size = 2L 55 | ), 56 | "ckpt_dir" 57 | ) 58 | 59 | # We should get the same thing if we specify by model instead. 60 | feats2 <- extract_features( 61 | examples = examples, 62 | model = "bert_base_uncased", 63 | batch_size = 2L 64 | ) 65 | 66 | expect_identical(feats2, feats) 67 | rm(feats2) 68 | 69 | # Also make sure it fails if they don't have the model. 
70 | expect_error( 71 | extract_features( 72 | examples = examples, 73 | model = "bert_base_cased" 74 | ), 75 | "Specify ckpt_dir" 76 | ) 77 | 78 | vocab_file <- file.path(BERT_PRETRAINED_DIR, "vocab.txt") 79 | init_checkpoint <- file.path(BERT_PRETRAINED_DIR, "bert_model.ckpt") 80 | # Checkpoint "path" is actually only a stub filename; add ".index" to 81 | # check for a specific file. 82 | testthat::skip_if_not(file.exists(paste0( 83 | init_checkpoint, 84 | ".index" 85 | )), 86 | message = "Checkpoint index not found; skipping test." 87 | ) 88 | 89 | bert_config_file <- file.path(BERT_PRETRAINED_DIR, "bert_config.json") 90 | 91 | # Each token should be repeated 4 times (once for each of the 4 layers 92 | # requested by default). I'm sure there's a better way to do this, but this 93 | # works for these sentences. 94 | tokens <- unlist( 95 | c( 96 | "[CLS]", 97 | tolower(stringr::str_extract_all( 98 | examples[[1]]$text_a, 99 | "\\b[^\\s]+\\b", 100 | simplify = TRUE 101 | )), 102 | ".", 103 | "[SEP]", 104 | tolower(stringr::str_extract_all( 105 | examples[[1]]$text_b, 106 | "\\b[^\\s]+\\b", 107 | simplify = TRUE 108 | )), 109 | ".", 110 | "[SEP]", 111 | "[CLS]", 112 | tolower(stringr::str_extract_all( 113 | examples[[1]]$text_a, 114 | "\\b[^\\s]+\\b", 115 | simplify = TRUE 116 | )), 117 | ".", 118 | "[SEP]" 119 | ) 120 | ) 121 | 122 | expect_equal( 123 | sort(unique(feats$output$token)), 124 | sort(unique(tokens)) 125 | ) 126 | 127 | # By default we fetch the last 4 layers. 128 | expect_equal(nrow(feats$output), length(tokens) * 4) 129 | expect_equal( 130 | ncol(feats$output), 131 | 5L + 768L 132 | ) 133 | expect_length(feats, 1) 134 | 135 | # Make sure we can grab layer 0 when we want to. 136 | feats <- extract_features( 137 | examples = examples, 138 | vocab_file = vocab_file, 139 | bert_config_file = bert_config_file, 140 | init_checkpoint = init_checkpoint, 141 | batch_size = 2L, 142 | layer_indexes = -4:0, 143 | features = c("output", "attention") 144 | ) 145 | 146 | expect_length(feats, 2) 147 | 148 | # There may be some minor numerical differences across different systems. Need 149 | # to do a comparison along the lines of dplyr::near. Needed to update these 150 | # tests for the new format, because some of the layer/token index repeats went 151 | # away, and thus the sum changed. I tibbled the expected feats and resaved. 152 | test_feats_flat <- suppressWarnings(as.numeric(unlist(feats$output))) 153 | 154 | # expected_feats <- readRDS( 155 | # here::here("tests", "testthat", "sample_feats.rds") 156 | # ) 157 | expected_feats <- readRDS("sample_feats.rds") 158 | # The sorting changed since I saved an example, so let's put it into the same 159 | # order as the one we're getting now. 
160 | expected_feats <- dplyr::arrange( 161 | expected_feats, 162 | sequence_index, 163 | layer_index, 164 | token_index 165 | ) 166 | expected_feats_flat <- suppressWarnings(as.numeric(unlist(expected_feats))) 167 | 168 | tol <- 10^(-5) 169 | 170 | # check both the sum and mean relative difference 171 | rel_diff_sum <- abs(sum(test_feats_flat, na.rm = TRUE) - 172 | sum(expected_feats_flat, na.rm = TRUE)) / 173 | (tol + abs(sum(test_feats_flat, na.rm = TRUE) + 174 | sum(expected_feats_flat, na.rm = TRUE))) 175 | testthat::expect_lte(rel_diff_sum, tol) 176 | 177 | mean_relative_difference <- mean(abs(test_feats_flat - expected_feats_flat) / 178 | (tol + abs(test_feats_flat + 179 | expected_feats_flat)), 180 | na.rm = TRUE 181 | ) 182 | 183 | testthat::expect_lte(mean_relative_difference, tol) 184 | 185 | test_attn_flat <- suppressWarnings( 186 | as.numeric(unlist(feats$attention$attention_weight)) 187 | ) 188 | 189 | # expected_attn <- readRDS( 190 | # here::here("tests", "testthat", "attention_probs.rds") 191 | # ) 192 | expected_attn <- readRDS("attention_probs.rds") 193 | expected_attn_flat <- suppressWarnings(as.numeric(unlist(expected_attn))) 194 | expected_attn_flat <- expected_attn_flat[!is.na(expected_attn_flat)] 195 | 196 | # The original expected value has rotated matrices relative to the tidy 197 | # tibble. However, it's unlikely that they'd work out to have sum and mean 198 | # below within tolerance if they were actually different, so I'm sorting to 199 | # get a "good enough" evaluation. 200 | expected_attn_flat <- sort(expected_attn_flat) 201 | test_attn_flat <- sort(test_attn_flat) 202 | 203 | rel_diff_sum <- abs(sum(test_attn_flat, na.rm = TRUE) - 204 | sum(expected_attn_flat, na.rm = TRUE)) / 205 | (tol + abs(sum(test_attn_flat, na.rm = TRUE) + 206 | sum(expected_attn_flat, na.rm = TRUE))) 207 | testthat::expect_lte(rel_diff_sum, tol) 208 | 209 | mean_relative_difference <- mean(abs(test_attn_flat - expected_attn_flat) / 210 | (tol + abs(test_attn_flat + 211 | expected_attn_flat)), 212 | na.rm = TRUE 213 | ) 214 | 215 | testthat::expect_lte(mean_relative_difference, tol) 216 | 217 | feats <- extract_features( 218 | examples = examples, 219 | vocab_file = vocab_file, 220 | bert_config_file = bert_config_file, 221 | init_checkpoint = init_checkpoint, 222 | batch_size = 2L, 223 | features = "output" 224 | ) 225 | expect_length(feats, 1) 226 | expect_is(feats$output, "tbl_df") 227 | expect_equal( 228 | colnames(feats$output), 229 | c( 230 | "sequence_index", "segment_index", "token_index", "token", "layer_index", 231 | paste0("V", 1:768) 232 | ) 233 | ) 234 | 235 | feats <- extract_features( 236 | examples = examples, 237 | vocab_file = vocab_file, 238 | bert_config_file = bert_config_file, 239 | init_checkpoint = init_checkpoint, 240 | batch_size = 2L, 241 | features = "attention" 242 | ) 243 | expect_length(feats, 1) 244 | expect_is(feats$attention, "tbl_df") 245 | expect_equal( 246 | colnames(feats$attention), 247 | c( 248 | "sequence_index", "token_index", "segment_index", "token", 249 | "layer_index", "head_index", "attention_token_index", 250 | "attention_segment_index", "attention_token", "attention_weight" 251 | ) 252 | ) 253 | 254 | # works for examples given as character vectors 255 | text_example1 <- "one" 256 | text_example2 <- list(c("one", "two"), c("three", "four")) 257 | text_example3 <- list(list("one", "two"), list("three", "four")) 258 | lone_example <- make_examples_simple(text_example1)[[1]] 259 | 260 | feats1 <- extract_features( 261 | examples = 
text_example1, 262 | model= "bert_base_uncased" 263 | ) 264 | testthat::expect_equal(dim(feats1$output), c(12L, 773L)) 265 | 266 | feats1b <- extract_features( 267 | examples = lone_example, 268 | model= "bert_base_uncased" 269 | ) 270 | testthat::expect_identical(feats1, feats1b) 271 | 272 | feats2 <- extract_features( 273 | examples = text_example2, 274 | model= "bert_base_uncased" 275 | ) 276 | testthat::expect_equal(dim(feats2$output), c(40L, 773L)) 277 | 278 | feats3 <- extract_features( 279 | examples = text_example3, 280 | model= "bert_base_uncased" 281 | ) 282 | testthat::expect_identical(feats2, feats3) 283 | 284 | # Manual speed tests: 285 | # emma_lines <- janeaustenr::emma[janeaustenr::emma != ""][5:54] 286 | # examples <- purrr::imap( 287 | # emma_lines, 288 | # ~ InputExample_EF(unique_id = .y, text_a = .x) 289 | # ) 290 | # microbenchmark::microbenchmark( 291 | # feats <- extract_features( 292 | # examples = examples, 293 | # vocab_file = vocab_file, 294 | # bert_config_file = bert_config_file, 295 | # init_checkpoint = init_checkpoint, 296 | # batch_size = 2L, 297 | # features = "attention" 298 | # ), 299 | # times = 1 300 | # ) 301 | }) 302 | 303 | test_that(".get_actual_index works", { 304 | testthat::expect_error( 305 | .get_actual_index(index = 0, length = 10), 306 | "Ambiguous" 307 | ) 308 | 309 | testthat::expect_error( 310 | .get_actual_index(index = 11, length = 10), 311 | "out of range" 312 | ) 313 | 314 | testthat::expect_identical(.get_actual_index(index = -2, length = 10), 9L) 315 | 316 | testthat::expect_identical(.get_actual_index(index = 9, length = 10), 9L) 317 | }) 318 | 319 | test_that("make_examples_simple works", { 320 | text <- c( 321 | "Here are some words.", 322 | "Here are some more words." 323 | ) 324 | input_ex <- make_examples_simple(text) 325 | testthat::expect_s3_class(input_ex[[1]], "InputExample_EF") 326 | 327 | testthat::expect_identical(input_ex[[1]]$text_a, text[[1]]) 328 | testthat::expect_null(input_ex[[1]]$text_b) 329 | testthat::expect_identical(input_ex[[2]]$text_a, text[[2]]) 330 | testthat::expect_null(input_ex[[2]]$text_b) 331 | }) 332 | 333 | test_that("make_examples_simple works for two-segment examples", { 334 | text <- list( 335 | c( 336 | "First sequence, first segment.", 337 | "First sequence, second segment." 338 | ), 339 | c( 340 | "Second sequence, first segment.", 341 | "Second sequence, second segment.", 342 | "Second sequence, EXTRA segment." 343 | ), 344 | "Third sequence, only one segment." 345 | ) 346 | testthat::expect_warning( 347 | input_ex <- make_examples_simple(text), 348 | "ignored" 349 | ) 350 | testthat::expect_identical(input_ex[[1]]$text_a, text[[1]][[1]]) 351 | testthat::expect_identical(input_ex[[1]]$text_b, text[[1]][[2]]) 352 | testthat::expect_identical(input_ex[[2]]$text_a, text[[2]][[1]]) 353 | testthat::expect_identical(input_ex[[2]]$text_b, text[[2]][[2]]) 354 | testthat::expect_identical(input_ex[[3]]$text_a, text[[3]]) 355 | testthat::expect_null(input_ex[[3]]$text_b) 356 | }) 357 | -------------------------------------------------------------------------------- /tests/testthat/test-modeling.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | test_that("The BertConfig routines work", { 17 | config <- BertConfig(vocab_size = 30522L) 18 | expected_config <- list( 19 | "vocab_size" = 30522L, 20 | "hidden_size" = 768L, 21 | "num_hidden_layers" = 12L, 22 | "num_attention_heads" = 12L, 23 | "hidden_act" = "gelu", 24 | "intermediate_size" = 3072L, 25 | "hidden_dropout_prob" = 0.1, 26 | "attention_probs_dropout_prob" = 0.1, 27 | "max_position_embeddings" = 512L, 28 | "type_vocab_size" = 16L, 29 | "initializer_range" = 0.02 30 | ) 31 | testthat::expect_is(config, "BertConfig") 32 | testthat::expect_identical(names(config), names(expected_config)) 33 | 34 | json_file <- "bert_config.json" 35 | json_config <- bert_config_from_json_file(json_file) 36 | 37 | testthat::expect_is(json_config, "BertConfig") 38 | testthat::expect_identical(names(json_config), names(expected_config)) 39 | }) 40 | 41 | test_that("The BertModel routine works", { 42 | with(tensorflow::tf$variable_scope("tests", 43 | reuse = tensorflow::tf$AUTO_REUSE 44 | ), { 45 | input_ids <- tensorflow::tf$constant(list( 46 | list(31L, 51L, 99L), 47 | list(15L, 5L, 0L) 48 | )) 49 | 50 | input_mask <- tensorflow::tf$constant(list( 51 | list(1L, 1L, 1L), 52 | list(1L, 1L, 0L) 53 | )) 54 | token_type_ids <- tensorflow::tf$constant(list( 55 | list(0L, 0L, 1L), 56 | list(0L, 2L, 0L) 57 | )) 58 | config <- BertConfig( 59 | vocab_size = 32000L, 60 | hidden_size = 768L, 61 | num_hidden_layers = 8L, 62 | num_attention_heads = 12L, 63 | intermediate_size = 1024L 64 | ) 65 | model_train <- BertModel( 66 | config = config, 67 | is_training = TRUE, 68 | input_ids = input_ids, 69 | input_mask = input_mask, 70 | token_type_ids = token_type_ids 71 | ) 72 | model <- BertModel( 73 | config = config, 74 | is_training = FALSE, 75 | input_ids = input_ids, 76 | input_mask = NULL, 77 | token_type_ids = NULL 78 | ) 79 | }) 80 | testthat::expect_is(model, "BertModel") 81 | testthat::expect_is( 82 | model$embedding_output, 83 | "tensorflow.python.framework.ops.Tensor" 84 | ) 85 | testthat::expect_is( 86 | model$embedding_table, 87 | "tensorflow.python.ops.variables.RefVariable" 88 | ) 89 | testthat::expect_is( 90 | model$sequence_output, 91 | "tensorflow.python.framework.ops.Tensor" 92 | ) 93 | testthat::expect_is( 94 | model$pooled_output, 95 | "tensorflow.python.framework.ops.Tensor" 96 | ) 97 | testthat::expect_is( 98 | model$all_encoder_layers[[1]], 99 | "tensorflow.python.framework.ops.Tensor" 100 | ) 101 | 102 | # dropout should only be applied in training! 
103 | testthat::expect_true(grepl( 104 | pattern = "dropout", 105 | model_train$embedding_output$op$name 106 | )) 107 | testthat::expect_false(grepl( 108 | pattern = "dropout", 109 | model$embedding_output$op$name 110 | )) 111 | }) 112 | 113 | 114 | test_that("gelu works", { 115 | with( 116 | tensorflow::tf$variable_scope("tests", 117 | reuse = tensorflow::tf$AUTO_REUSE 118 | ), 119 | tfx <- tensorflow::tf$get_variable("tfx", tensorflow::shape(10L)) 120 | ) 121 | tgelu <- gelu(tfx) 122 | testthat::expect_is(tgelu, "tensorflow.python.framework.ops.Tensor") 123 | testthat::expect_identical(tgelu$shape$as_list(), 10L) 124 | }) 125 | 126 | test_that("get_activation works", { 127 | testthat::expect_identical(get_activation("gelu"), gelu) 128 | testthat::expect_equal( 129 | get_activation("relu"), 130 | tensorflow::tf$nn$relu 131 | ) 132 | testthat::expect_equal( 133 | get_activation("tanh"), 134 | tensorflow::tf$tanh 135 | ) 136 | testthat::expect_true(is.na(get_activation("linear"))) 137 | }) 138 | 139 | test_that("get_assignment_map_from_checkpoint works", { 140 | # Create a "model" with a couple variables that overlap some variable names in 141 | # the BERT checkpoint. (The actual variables aren't compatible with the 142 | # checkpoint.) The BERT checkpoint is large, and won't be included in repo. A 143 | # checkpoint is downloaded as part of test setup. Run this test only if the 144 | # checkpoint can be found. 145 | 146 | init_checkpoint <- file.path( 147 | cpdir, 148 | "bert_model.ckpt" 149 | ) 150 | 151 | # Checkpoint "path" is actually only a stub filename; add ".index" to 152 | # check for a specific file. 153 | testthat::skip_if_not(file.exists(paste0( 154 | init_checkpoint, 155 | ".index" 156 | )), 157 | message = "Checkpoint index not found; skipping test." 
158 | ) 159 | 160 | with(tensorflow::tf$variable_scope("bert", 161 | reuse = tensorflow::tf$AUTO_REUSE 162 | ), { 163 | test_ten1 <- tensorflow::tf$get_variable( 164 | "encoder/layer_9/output/dense/bias", 165 | shape = c(1L, 2L, 3L) 166 | ) 167 | test_ten2 <- tensorflow::tf$get_variable( 168 | "encoder/layer_9/output/dense/kernel", 169 | shape = c(1L, 2L, 3L) 170 | ) 171 | }) 172 | tvars <- tensorflow::tf$get_collection( 173 | tensorflow::tf$GraphKeys$GLOBAL_VARIABLES 174 | ) 175 | 176 | amap <- get_assignment_map_from_checkpoint(tvars, init_checkpoint) 177 | expected_result <- readRDS("sample_amap.rds") 178 | testthat::expect_identical(amap, expected_result) 179 | }) 180 | 181 | 182 | test_that("dropout works", { 183 | with( 184 | tensorflow::tf$variable_scope("tests", 185 | reuse = tensorflow::tf$AUTO_REUSE 186 | ), 187 | todrop <- tensorflow::tf$get_variable( 188 | "todrop", 189 | tensorflow::shape(10L, 20L) 190 | ) 191 | ) 192 | dropped <- dropout(todrop, 0.3) 193 | testthat::expect_is(dropped, "tensorflow.python.framework.ops.Tensor") 194 | testthat::expect_true(grepl(pattern = "dropout", dropped$op$name)) 195 | }) 196 | 197 | test_that("layer_norm works", { 198 | with( 199 | tensorflow::tf$variable_scope("tests", 200 | reuse = tensorflow::tf$AUTO_REUSE 201 | ), 202 | lnorm <- tensorflow::tf$get_variable("lnorm", tensorflow::shape(10L)) 203 | ) 204 | normed <- layer_norm(lnorm) 205 | testthat::expect_is(normed, "tensorflow.python.framework.ops.Tensor") 206 | testthat::expect_true(grepl(pattern = "LayerNorm", normed$op$name)) 207 | }) 208 | 209 | test_that("layer_norm_and_dropout works", { 210 | with( 211 | tensorflow::tf$variable_scope("tests", 212 | reuse = tensorflow::tf$AUTO_REUSE 213 | ), 214 | lndr <- tensorflow::tf$get_variable("lndr", tensorflow::shape(10L)) 215 | ) 216 | normed_and_dropped <- layer_norm_and_dropout(lndr, dropout_prob = 0.5) 217 | testthat::expect_is( 218 | normed_and_dropped, 219 | "tensorflow.python.framework.ops.Tensor" 220 | ) 221 | testthat::expect_true(grepl(pattern = "dropout", normed_and_dropped$op$name)) 222 | }) 223 | 224 | test_that("create_initializer works", { 225 | init <- create_initializer() 226 | testthat::expect_is(init, "tensorflow.python.ops.init_ops.TruncatedNormal") 227 | }) 228 | 229 | test_that("embedding_lookup works", { 230 | with(tensorflow::tf$variable_scope("tests", 231 | reuse = tensorflow::tf$AUTO_REUSE 232 | ), { 233 | ids <- tensorflow::tf$get_variable("ids", 234 | dtype = "int32", 235 | shape = tensorflow::shape(10, 20) 236 | ) 237 | el <- embedding_lookup(ids, 238 | vocab_size = 100L, 239 | word_embedding_name = "some_name" 240 | ) 241 | }) 242 | testthat::expect_is(el[[1]], "tensorflow.python.framework.ops.Tensor") 243 | testthat::expect_is(el[[2]], "tensorflow.python.ops.variables.RefVariable") 244 | }) 245 | 246 | test_that("embedding_postprocessor works", { 247 | batch_size <- 10 248 | seq_length <- 512 249 | embedding_size <- 200 250 | with(tensorflow::tf$variable_scope("tests", 251 | reuse = tensorflow::tf$AUTO_REUSE 252 | ), { 253 | input_tensor <- tensorflow::tf$get_variable( 254 | "input_epp", 255 | dtype = "float", 256 | shape = tensorflow::shape(batch_size, seq_length, embedding_size) 257 | ) 258 | token_type_ids <- tensorflow::tf$get_variable( 259 | "ids_epp", 260 | dtype = "int32", 261 | shape = tensorflow::shape(batch_size, seq_length) 262 | ) 263 | 264 | pp_embed <- embedding_postprocessor(input_tensor, 265 | use_token_type = TRUE, 266 | token_type_ids = token_type_ids 267 | ) 268 | }) 269 | 
testthat::expect_is(pp_embed, "tensorflow.python.framework.ops.Tensor") 270 | testthat::expect_true(grepl(pattern = "dropout", pp_embed$op$name)) 271 | }) 272 | 273 | test_that("create_attention_mask_from_input_mask works", { 274 | with(tensorflow::tf$variable_scope("tests", 275 | reuse = tensorflow::tf$AUTO_REUSE 276 | ), { 277 | from_tensor <- ids <- tensorflow::tf$get_variable( 278 | "ften", 279 | dtype = "float", 280 | shape = tensorflow::shape(10, 20) 281 | ) 282 | to_mask <- ids <- tensorflow::tf$get_variable( 283 | "mask", 284 | dtype = "int32", 285 | shape = tensorflow::shape(10, 30) 286 | ) 287 | amask <- create_attention_mask_from_input_mask(from_tensor, to_mask) 288 | }) 289 | testthat::expect_is(amask, "tensorflow.python.framework.ops.Tensor") 290 | testthat::expect_identical(amask$shape$as_list(), c(10L, 20L, 30L)) 291 | }) 292 | 293 | test_that("transformer_model works", { 294 | batch_size <- 10 295 | seq_length <- 500 296 | hidden_size <- 120 297 | num_hidden <- 7 298 | 299 | with(tensorflow::tf$variable_scope("tests", 300 | reuse = tensorflow::tf$AUTO_REUSE 301 | ), { 302 | input_tensor <- tensorflow::tf$get_variable("input_tm", 303 | shape = c( 304 | batch_size, 305 | seq_length, 306 | hidden_size 307 | ) 308 | ) 309 | model_t <- transformer_model( 310 | input_tensor = input_tensor, 311 | hidden_size = hidden_size, 312 | num_hidden_layers = num_hidden, 313 | do_return_all_layers = TRUE 314 | ) 315 | }) 316 | # ATTN: modified below to account for attention_data 317 | attention_data <- model_t$attention_data 318 | testthat::expect_equal(length(attention_data), num_hidden) 319 | testthat::expect_is( 320 | attention_data[[num_hidden]], 321 | "tensorflow.python.framework.ops.Tensor" 322 | ) 323 | model_t <- model_t$final_outputs 324 | # ATTN: modified above to account for attention_data 325 | 326 | testthat::expect_equal(length(model_t), num_hidden) 327 | testthat::expect_is( 328 | model_t[[num_hidden]], 329 | "tensorflow.python.framework.ops.Tensor" 330 | ) 331 | }) 332 | 333 | 334 | test_that("get_shape_list works", { 335 | with(tensorflow::tf$variable_scope("tests", 336 | reuse = tensorflow::tf$AUTO_REUSE 337 | ), { 338 | phold <- tensorflow::tf$placeholder(tensorflow::tf$int32, 339 | shape = tensorflow::shape(4) 340 | ) 341 | static_shape <- get_shape_list(phold) 342 | tfunique <- tensorflow::tf$unique(phold) 343 | tfy <- tfunique$y 344 | dynamic_shape <- get_shape_list(tfy) 345 | }) 346 | testthat::expect_identical(static_shape, list(4L)) 347 | testthat::expect_is( 348 | dynamic_shape[[1]], 349 | "tensorflow.python.framework.ops.Tensor" 350 | ) 351 | }) 352 | 353 | test_that("reshape to/from matrix functions work", { 354 | with( 355 | tensorflow::tf$variable_scope("tests", 356 | reuse = tensorflow::tf$AUTO_REUSE 357 | ), 358 | r3t <- tensorflow::tf$get_variable("r3t", 359 | dtype = "int32", 360 | shape = tensorflow::shape(10, 20, 3) 361 | ) 362 | ) 363 | mat <- reshape_to_matrix(r3t) 364 | testthat::expect_is(mat, "tensorflow.python.framework.ops.Tensor") 365 | testthat::expect_identical(mat$shape$as_list(), c(200L, 3L)) 366 | 367 | ten3 <- reshape_from_matrix(mat, orig_shape_list = list(10L, 20L, 3L)) 368 | testthat::expect_is(ten3, "tensorflow.python.framework.ops.Tensor") 369 | testthat::expect_identical(ten3$shape$as_list(), c(10L, 20L, 3L)) 370 | }) 371 | 372 | test_that("assert_rank works", { 373 | with(tensorflow::tf$variable_scope("tests", 374 | reuse = tensorflow::tf$AUTO_REUSE 375 | ), { 376 | ten <- tensorflow::tf$get_variable("ten", 377 | dtype = "int32", 378 | 
shape = tensorflow::shape(10) 379 | ) 380 | testthat::expect_true(assert_rank(ten, 1)) 381 | testthat::expect_true(assert_rank(ten, 1:2)) 382 | testthat::expect_error(assert_rank(ten, 2), "not equal") 383 | }) 384 | }) 385 | -------------------------------------------------------------------------------- /tests/testthat/test-optimization.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | test_that("create_optimizer works", { 17 | with(tensorflow::tf$variable_scope("tests", 18 | reuse = tensorflow::tf$AUTO_REUSE 19 | ), { 20 | totrain <- tensorflow::tf$get_variable( 21 | "totrain", 22 | tensorflow::shape(10L, 20L) 23 | ) 24 | loss <- 2 * totrain 25 | 26 | t_op <- create_optimizer( 27 | loss = loss, 28 | init_lr = 0.01, 29 | num_train_steps = 20L, 30 | num_warmup_steps = 10L, 31 | use_tpu = FALSE 32 | ) 33 | }) 34 | 35 | testthat::expect_is( 36 | t_op, 37 | "tensorflow.python.framework.ops.Operation" 38 | ) 39 | 40 | testthat::expect_true(grepl(pattern = "group_deps", t_op$name)) 41 | 42 | # now actually put some training variables in place... 43 | with( 44 | tensorflow::tf$variable_scope("tests", 45 | reuse = tensorflow::tf$AUTO_REUSE 46 | ), 47 | totrain <- tensorflow::tf$get_variable( 48 | "totrain", 49 | tensorflow::shape(10L, 20L) 50 | ) 51 | ) 52 | }) 53 | 54 | 55 | test_that("AdamWeightDecayOptimizer works", { 56 | with(tensorflow::tf$variable_scope("tests", 57 | reuse = tensorflow::tf$AUTO_REUSE 58 | ), { 59 | awd_opt <- AdamWeightDecayOptimizer(learning_rate = 0.01) 60 | }) 61 | 62 | testthat::expect_is( 63 | awd_opt, 64 | "AdamWeightDecayOptimizer" 65 | ) 66 | testthat::expect_is( 67 | awd_opt, 68 | "tensorflow.python.training.optimizer.Optimizer" 69 | ) 70 | # after our hack, `apply_gradients` is a function, not a method. 71 | testthat::expect_is( 72 | awd_opt$apply_gradients, 73 | "python.builtin.function" 74 | ) 75 | }) 76 | -------------------------------------------------------------------------------- /tests/testthat/test-run_classifier.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | test_that("truncate_seq_pair works", { 17 | tokens_a <- c("a", "b", "c", "d") 18 | tokens_b <- c("w", "x", "y", "z") 19 | trunc_seq <- truncate_seq_pair(tokens_a, tokens_b, 5) 20 | expected_result <- list( 21 | trunc_a = c("a", "b", "c"), 22 | trunc_b = c("w", "x") 23 | ) 24 | testthat::expect_identical(trunc_seq, expected_result) 25 | }) 26 | 27 | test_that("create_model works", { 28 | with(tensorflow::tf$variable_scope("tests_class1", 29 | reuse = tensorflow::tf$AUTO_REUSE 30 | ), { 31 | input_ids <- tensorflow::tf$constant(list( 32 | list(31L, 51L, 99L), 33 | list(15L, 5L, 0L) 34 | )) 35 | 36 | input_mask <- tensorflow::tf$constant(list( 37 | list(1L, 1L, 1L), 38 | list(1L, 1L, 0L) 39 | )) 40 | token_type_ids <- tensorflow::tf$constant(list( 41 | list(0L, 0L, 1L), 42 | list(0L, 2L, 0L) 43 | )) 44 | config <- BertConfig( 45 | vocab_size = 32000L, 46 | hidden_size = 768L, 47 | num_hidden_layers = 8L, 48 | type_vocab_size = 2L, 49 | num_attention_heads = 12L, 50 | intermediate_size = 1024L 51 | ) 52 | class_model <- create_model( 53 | bert_config = config, 54 | is_training = TRUE, 55 | input_ids = input_ids, 56 | input_mask = input_mask, 57 | segment_ids = token_type_ids, 58 | labels = c(1L, 2L), 59 | num_labels = 2L 60 | ) 61 | }) 62 | testthat::expect_is( 63 | class_model$loss, 64 | "tensorflow.python.framework.ops.Tensor" 65 | ) 66 | testthat::expect_is( 67 | class_model$per_example_loss, 68 | "tensorflow.python.framework.ops.Tensor" 69 | ) 70 | testthat::expect_is( 71 | class_model$logits, 72 | "tensorflow.python.framework.ops.Tensor" 73 | ) 74 | testthat::expect_is( 75 | class_model$probabilities, 76 | "tensorflow.python.framework.ops.Tensor" 77 | ) 78 | 79 | testthat::expect_true(grepl( 80 | pattern = "Mean", 81 | class_model$loss$op$name 82 | )) 83 | testthat::expect_true(grepl( 84 | pattern = "Neg", 85 | class_model$per_example_loss$op$name 86 | )) 87 | testthat::expect_true(grepl( 88 | pattern = "BiasAdd", 89 | class_model$logits$op$name 90 | )) 91 | testthat::expect_true(grepl( 92 | pattern = "Softmax", 93 | class_model$probabilities$op$name 94 | )) 95 | }) 96 | 97 | test_that("model_fn_builder works", { 98 | # Run this test only if the checkpoint can be found. 99 | init_checkpoint <- file.path( 100 | cpdir, # from setup.R 101 | "bert_model.ckpt" 102 | ) 103 | 104 | # Checkpoint "path" is actually only a stub filename; add ".index" to 105 | # check for a specific file. 106 | testthat::skip_if_not(file.exists(paste0( 107 | init_checkpoint, 108 | ".index" 109 | )), 110 | message = "Checkpoint index not found; skipping test." 
111 | ) 112 | with(tensorflow::tf$variable_scope("tests_class2", 113 | reuse = tensorflow::tf$AUTO_REUSE 114 | ), { 115 | input_ids <- tensorflow::tf$constant(list( 116 | list(31L, 51L, 99L), 117 | list(15L, 5L, 0L) 118 | )) 119 | 120 | input_mask <- tensorflow::tf$constant(list( 121 | list(1L, 1L, 1L), 122 | list(1L, 1L, 0L) 123 | )) 124 | token_type_ids <- tensorflow::tf$constant(list( 125 | list(0L, 0L, 1L), 126 | list(0L, 2L, 0L) 127 | )) 128 | config <- BertConfig( 129 | vocab_size = 30522L, 130 | hidden_size = 768L, 131 | num_hidden_layers = 8L, 132 | type_vocab_size = 2L, 133 | num_attention_heads = 12L, 134 | intermediate_size = 3072L 135 | ) 136 | 137 | test_mod_fn <- model_fn_builder( 138 | bert_config = config, 139 | num_labels = 2L, 140 | init_checkpoint = init_checkpoint, 141 | learning_rate = 0.01, 142 | num_train_steps = 20L, 143 | num_warmup_steps = 10L, 144 | use_tpu = FALSE 145 | ) 146 | # After we implement InputFeatures class, come back and add tests for 147 | # `test_mod_fn`. Something like this, but better: 148 | # features <- list() 149 | # features$input_ids <- input_ids 150 | # features$input_mask <- input_mask 151 | # features$segment_ids <- token_type_ids 152 | # features$label_ids <- c(1L, 2L) 153 | 154 | # mod_fn_output <- test_mod_fn(features = features, 155 | # labels = NULL, 156 | # mode = "train", 157 | # params = NULL) 158 | }) 159 | # This isn't much of a test, but it does confirm that the maker function 160 | # ran, which is non-trivial. 161 | testthat::expect_is(test_mod_fn, "function") 162 | }) 163 | 164 | test_that("Examples/features creation routines work", { 165 | tokenizer <- FullTokenizer("vocab.txt") 166 | input_ex1 <- InputExample( 167 | guid = 1L, 168 | text_a = "Some text to classify.", 169 | text_b = "More wordy words.", 170 | label = "good" 171 | ) 172 | 173 | testthat::expect_is(input_ex1, "InputExample") 174 | testthat::expect_identical( 175 | names(input_ex1), 176 | c("guid", "text_a", "text_b", "label") 177 | ) 178 | input_ex2 <- InputExample( 179 | guid = 2L, 180 | text_a = "This is another example.", 181 | text_b = "So many words.", 182 | label = "bad" 183 | ) 184 | feat <- convert_examples_to_features( 185 | examples = list(input_ex1, input_ex2), 186 | label_list = c("good", "bad"), 187 | max_seq_length = 15L, 188 | tokenizer = tokenizer 189 | ) 190 | testthat::expect_identical(length(feat), 2L) 191 | testthat::expect_is(feat[[1]], "InputFeatures") 192 | testthat::expect_identical( 193 | names(feat[[1]]), 194 | c( 195 | "input_ids", 196 | "input_mask", 197 | "segment_ids", 198 | "label_id", 199 | "is_real_example" 200 | ) 201 | ) 202 | }) 203 | -------------------------------------------------------------------------------- /tests/testthat/test-tokenization.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | test_that("the convert token/id functions work", { 17 | vocab <- c("token1" = 0, "token2" = 1, "token3" = 2) 18 | inv_voc <- names(vocab) 19 | names(inv_voc) <- vocab 20 | 21 | test_result <- convert_tokens_to_ids(vocab, c("token1", "token3")) 22 | expected_result <- c("token1" = 0, "token3" = 2) 23 | testthat::expect_identical(test_result, expected_result) 24 | 25 | test_result <- convert_ids_to_tokens(inv_voc, c(1, 3)) 26 | expected_result <- c("0" = "token1", "2" = "token3") 27 | testthat::expect_identical(test_result, expected_result) 28 | }) 29 | 30 | test_that("The FullTokenizer tokenizer works as expected", { 31 | f_tokenizer <- FullTokenizer("vocab.txt", TRUE) 32 | text <- "\u535A\u00E7anned words; tihs is Silly" 33 | test_result <- tokenize(f_tokenizer, text) 34 | expected_result <- c( 35 | "\u535A", "canned", "words", ";", 36 | "ti", "##hs", "is", "silly" 37 | ) 38 | testthat::expect_identical(test_result, expected_result) 39 | 40 | f_tokenizer <- FullTokenizer("vocab_small.txt", TRUE) 41 | text <- "know the unknowable!" 42 | test_result <- tokenize(f_tokenizer, text) 43 | expected_result <- c( 44 | "know", "the", "un", "##know", 45 | "##able", "[UNK]" 46 | ) 47 | testthat::expect_identical(test_result, expected_result) 48 | }) 49 | 50 | 51 | test_that("Tokenizers handle edge cases correctly", { 52 | test_string <- "remove char: \ufffd " 53 | vocab <- load_vocab(vocab_file = "vocab.txt") 54 | 55 | b_tokenizer <- BasicTokenizer(TRUE) 56 | test_result <- tokenize(b_tokenizer, text = test_string) 57 | expected_result <- c("remove", "char", ":") 58 | testthat::expect_identical(test_result, expected_result) 59 | 60 | wp_tokenizer <- WordpieceTokenizer(vocab, max_input_chars_per_word = 4) 61 | test_result <- tokenize(wp_tokenizer, text = "excessively long") 62 | expected_result <- c("[UNK]", "long") 63 | testthat::expect_identical(test_result, expected_result) 64 | 65 | expect_identical( 66 | load_vocab("vocab0.txt"), 67 | integer(0) 68 | ) 69 | }) 70 | 71 | 72 | test_that("whitespace_tokenize splits a string on whitespace", { 73 | test_string <- " some\ttext\nwith whitespace " 74 | test_result <- whitespace_tokenize(test_string) 75 | expected_result <- c("some", "text", "with", "whitespace") 76 | testthat::expect_identical(test_result, expected_result) 77 | }) 78 | 79 | 80 | 81 | test_that("strip_accents replaces accented chars with nearest equivalents", { 82 | test_string <- "fa\u00E7ile" 83 | test_result <- strip_accents(test_string) 84 | expected_result <- "facile" 85 | testthat::expect_identical(test_result, expected_result) 86 | }) 87 | 88 | 89 | test_that("split_on_punc splits a string before and after punctuation chars", { 90 | test_string <- "stop! don't touch that." 91 | test_result <- split_on_punc(test_string) 92 | expected_result <- c("stop", "!", " don", "'", "t touch that", ".") 93 | testthat::expect_identical(test_result, expected_result) 94 | 95 | test_string <- "!" 
96 | test_result <- split_on_punc(test_string) 97 | expected_result <- c("!") 98 | testthat::expect_identical(test_result, expected_result) 99 | }) 100 | 101 | 102 | 103 | 104 | test_that("is_whitespace correctly classifies characters", { 105 | # tests from BERT: tokenization_test.py 106 | testthat::expect_true(is_whitespace(" ")) 107 | testthat::expect_true(is_whitespace("\t")) 108 | testthat::expect_true(is_whitespace("\r")) 109 | testthat::expect_true(is_whitespace("\n")) 110 | testthat::expect_true(is_whitespace("\u00A0")) # non-breaking space 111 | 112 | testthat::expect_false(is_whitespace("A")) 113 | testthat::expect_false(is_whitespace("-")) 114 | }) 115 | 116 | 117 | test_that("is_control correctly classifies characters", { 118 | # tests from BERT: tokenization_test.py 119 | testthat::expect_true(is_control("\u0005")) # 'Enquiry' control character 120 | 121 | testthat::expect_false(is_control("A")) 122 | testthat::expect_false(is_control(" ")) 123 | testthat::expect_false(is_control("\t")) 124 | testthat::expect_false(is_control("\r")) 125 | }) 126 | 127 | 128 | 129 | test_that("is_punctuation correctly classifies characters", { 130 | # tests from BERT: tokenization_test.py 131 | testthat::expect_true(is_punctuation("-")) 132 | testthat::expect_true(is_punctuation("$")) 133 | testthat::expect_true(is_punctuation("`")) 134 | testthat::expect_true(is_punctuation(".")) 135 | 136 | testthat::expect_false(is_punctuation("A")) 137 | testthat::expect_false(is_punctuation(" ")) 138 | }) 139 | 140 | 141 | test_that("tokenize_text works correctly", { 142 | text <- c("Who doesn't like tacos?", "Not me!") 143 | tokens <- tokenize_text(text = text, ckpt_dir = cpdir) 144 | testthat::expect_identical(length(tokens[[1]]), 10L) 145 | testthat::expect_identical(length(tokens[[2]]), 5L) 146 | }) 147 | 148 | test_that("check_vocab works correctly", { 149 | to_check <- c("apple", "appl") 150 | vcheck <- check_vocab(words = to_check, ckpt_dir = cpdir) 151 | testthat::expect_identical(vcheck, c(TRUE, FALSE)) 152 | }) 153 | -------------------------------------------------------------------------------- /tests/testthat/test_checkpoints/.gitignore: -------------------------------------------------------------------------------- 1 | bert_base_uncased.zip 2 | -------------------------------------------------------------------------------- /tests/testthat/test_checkpoints/testing_checkpoint.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/test_checkpoints/testing_checkpoint.tar.gz -------------------------------------------------------------------------------- /tests/testthat/tokenizer.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/tokenizer.rds -------------------------------------------------------------------------------- /tests/testthat/vocab0.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/vocab0.txt -------------------------------------------------------------------------------- /tests/testthat/vocab_small.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | un 3 | ##know 4 | ##able 5 | the 6 | to 7 | know 8 | 
-------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/BERT_basics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "BERT Basics" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | vignette: > 7 | %\VignetteIndexEntry{BERT Basics} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 27 | 28 | ```{r, include = FALSE} 29 | knitr::opts_chunk$set( 30 | collapse = TRUE, 31 | comment = "#>" 32 | ) 33 | ``` 34 | 35 | 36 | 37 | 38 | 39 | 40 | If you are not familiar with BERT, I suggest you check out the helpful blog post [here](http://jalammar.github.io/illustrated-bert/), and the resources linked from it. 41 | 42 | Until you have time to do that, this vignette is intended to be a quick, bare-bones introduction to BERT--just enough so that the rest of this package makes sense. 43 | 44 | ## What is BERT? 45 | 46 | For now, think of BERT as a function that takes in text and puts out numbers--a long list of numbers for *each* token (~word) of the input. 47 | 48 | In fact, there is really a family of such functions. 49 | Google research released the first BERT models in late 2018, and others have followed (as well as many that include slight modifications to the original structure). 50 | When referring to any results using "BERT", it is important to specify which BERT you're talking about. 51 | 52 | ## The input to BERT 53 | 54 | ### Tokenization 55 | 56 | BERT takes "natural" text as input, with some restrictions. 57 | The first thing that BERT does is to [tokenize](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) the input text. 58 | Most common words will be tokenized as themselves, but words that are not included in the vocabulary of that particular version of BERT will be tokenized as two or more "word pieces". 59 | Each character of punctuation also gets its own token. 60 | Finally, BERT adds a few special delimiter tokens to each piece of input. 61 | 62 | For example, "Who doesn't like tacos?" might be tokenized as: 63 | 64 | `[CLS] who doesn ' t like ta ##cos ? [SEP]` 65 | 66 | The "##" bit indicates that "cos" was originally attached to "ta". 67 | The word "tacos" was split up this way because that word isn't found in (this version of) BERT's limited vocabulary. 68 | 69 | Any output from BERT is organized in terms of tokens like this. 70 | 71 | ### Sequences 72 | 73 | Current BERT models can process chunks of text that consist of no more than 512 tokens (so the maximum number of words is rather fewer than that in practice). 74 | If you have text longer than that, you will need to find some way of splitting it up. 75 | A natural way of splitting up your text is by individual sentences, if possible. 76 | In this package (consistently) and in other literature (perhaps less consistently) a chunk of text processed by BERT is referred to as a "sequence". 77 | So a list of such chunks may be indexed by a "sequence_index", for example. 78 | 79 | ### Segments 80 | 81 | A sequence *may* be divided into two [^segments] "segments". 82 | This is useful when your particular application calls for two distinct pieces of text to be input (e.g. a model that evaluates the logical compatibility of two statements). 
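In RBERT, a two-segment sequence is supplied as a two-element character vector (the first element becomes segment A, the second segment B), wrapped in a list when there are several sequences.
The chunk below is a minimal, non-evaluated sketch; it assumes a BERT checkpoint has already been downloaded (for example with `RBERT::download_BERT_checkpoint("bert_base_uncased")`) and that its path is stored in `BERT_PRETRAINED_DIR`.

```{r, eval = FALSE}
# One input sequence built from two segments.
two_segment_example <- list(
  c(
    "I saw the branch on the bank.", # segment A
    "A second sequence of words."    # segment B
  )
)

feats <- RBERT::extract_features(
  examples = two_segment_example,
  ckpt_dir = BERT_PRETRAINED_DIR
)
```
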
83 | Note that the sequence as a whole still can't exceed the token limit, so splitting your text into segments is not a way to input longer text. 84 | In fact, a delimiter token is required to separate the two segments, which counts against the 512 limit, so you'll actually lose a bit of capacity by using segments. 85 | 86 | When possible, it's probably best to use individual sentences as your input sequences (or segments, if you're going that way). 87 | BERT was trained at the sentence level, and you're less likely to hit the token limit with individual sentences than with, say, paragraphs of text. 88 | 89 | [^segments]: In principle, the input could have any number of segments, but the BERT models are limited to two segments. 90 | 91 | ## The output of BERT 92 | 93 | ### Embeddings 94 | 95 | One way of thinking about BERT is as a machine for producing context-dependent [embeddings](https://en.wikipedia.org/wiki/Word_embedding). 96 | Here, an "embedding" is a vector [^vector] that gets associated with each token of the input. 97 | Useful embeddings will have a number of special properties. 98 | For example, tokens with similar meanings will have embeddings that are "nearby" in the embedding space. 99 | 100 | Static embeddings, such as [word2vec](https://en.wikipedia.org/wiki/Word2vec), have been around for several years. 101 | However, they typically have been unable to distinguish between [homographs](https://en.wikipedia.org/wiki/Homograph), such as "train [teach]" and "train [locomotive]". 102 | More generally, such embeddings are insensitive to word order, sentence structure, and other contextual cues. 103 | 104 | [^vector]: Think, "point in a high-dimensional space." 105 | 106 | ### Context-dependent embeddings 107 | 108 | In contrast, BERT's output can be understood as embedding vectors that *are* appropriately sensitive to the context in which each word is used. 109 | Not only does this make it possible to give homographs their own embeddings, it also allows more subtle differences in meaning and usage to be picked up. 110 | 111 | BERT outputs an embedding vector for each input token, including the special tokens "[CLS]" and "[SEP]". 112 | The vector for [CLS] can be thought of as "pure context"; it's the embedding of a token that has no intrinsic meaning, but is still sensitive to the context around it.[^CLS] 113 | 114 | BERT has a layered structure (see next section), and output embedding vectors can be obtained at each layer. 115 | 116 | [^CLS]: The interpretation of [CLS] is a bit more nuanced than this simple explanation implies. 117 | See [this discussion](https://github.com/google-research/bert/issues/196) for more details. 118 | 119 | ## The insides of BERT 120 | 121 | Context-dependence is achieved through the "attention" mechanism. 122 | In very cartoony terms, "attention" provides a way for each token to "choose" (based on training) which of its surrounding tokens to be most influenced by. 123 | BERT consists of multiple sequential layers of attention, with multiple "heads" per layer. 124 | Each head may split its attention across any of the input tokens (including itself). 125 | It may be helpful to picture each token processed by BERT as a many-headed beast, able to attend at each moment to any or all of its neighbors, and modify itself slightly in the next moment based on what it sees. 126 | 127 | The amount of attention paid by each token, to each token, in each layer and head, can be represented by a weight that is normalized to one for each "attender". 
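A rough, non-evaluated sketch of how these weights can be requested (again assuming `BERT_PRETRAINED_DIR` holds the path to a downloaded checkpoint):

```{r, eval = FALSE}
feats <- RBERT::extract_features(
  examples = "Who doesn't like tacos?",
  ckpt_dir = BERT_PRETRAINED_DIR,
  features = c("output", "attention")
)

# A tidy tibble with one attention weight for each combination of layer, head,
# attending token, and attended-to token in each input sequence.
feats$attention
```
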
128 | While these weights are not part of the formal output of BERT, it can be instructive to study them to better understand what aspects of language BERT models well. 129 | 130 | RBERT makes it easy (using the `extract_features` function) to obtain both the attention weights and the token embeddings at each layer of a BERT model. 131 | -------------------------------------------------------------------------------- /vignettes/RBERT_intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to RBERT" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Introduction to RBERT} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | 25 | 26 | ```{r, include = FALSE} 27 | knitr::opts_chunk$set( 28 | collapse = TRUE, 29 | comment = "#>" 30 | ) 31 | ``` 32 | 33 | 34 | 35 | 36 | 37 | RBERT is an implementation of Google Research's 38 | [BERT](https://github.com/google-research/bert) in `R`. 39 | 40 | BERT is a powerful general-purpose language model 41 | (paper [here](https://arxiv.org/pdf/1810.04805.pdf), 42 | helpful blog post [here](http://jalammar.github.io/illustrated-bert/)). 43 | BERT is written in Python, using [TensorFlow](https://www.tensorflow.org/). 44 | An `R` package for TensorFlow already [exists](https://tensorflow.rstudio.com/), 45 | so the goal of this project is to fully implement BERT in `R` down to the level 46 | of the TensorFlow API. 47 | 48 | Generally speaking, there are three levels at which BERT could be used: 49 | 50 | 1. Using the output of a pre-trained BERT model as features for downstream model 51 | 2. Fine-tuning on top of a pre-trained BERT model 52 | 3. Training a BERT model from scratch 53 | 54 | Currently, RBERT is functional at the first level, and possibly functional 55 | at the second level (speed becomes a significant consideration at this level). 56 | 57 | # Getting started with RBERT 58 | 59 | RBERT requires the tensorflow package to be installed and working. If that 60 | requirement is met, using RBERT at the first level is fairly 61 | straightforward. 62 | 63 | ```{r eval = FALSE} 64 | library(RBERT) 65 | library(dplyr) 66 | 67 | # Download pre-trained BERT model. This will go to an appropriate cache 68 | # directory by default. 69 | BERT_PRETRAINED_DIR <- RBERT::download_BERT_checkpoint( 70 | model = "bert_base_uncased" 71 | ) 72 | 73 | text_to_process <- c("Impulse is equal to the change in momentum.", 74 | "Changing momentum requires an impulse.", 75 | "An impulse is like a push.", 76 | "Impulse is force times time.") 77 | 78 | # Or make two-segment examples: 79 | text_to_process2 <- list(c("Impulse is equal to the change in momentum.", 80 | "Changing momentum requires an impulse."), 81 | c("An impulse is like a push.", 82 | "Impulse is force times time.")) 83 | 84 | BERT_feats <- extract_features( 85 | examples = text_to_process2, 86 | ckpt_dir = BERT_PRETRAINED_DIR, 87 | layer_indexes = 1:12 88 | ) 89 | 90 | # Extract the final layer output vector for the "[CLS]" token of the first 91 | # sentence. 92 | output_vector1 <- BERT_feats$output %>% 93 | dplyr::filter( 94 | sequence_index == 1, 95 | token == "[CLS]", 96 | layer_index == 12 97 | ) %>% 98 | dplyr::select(dplyr::starts_with("V")) %>% 99 | unlist() 100 | output_vector1 101 | 102 | # Extract output vectors for all sentences... 103 | # These vectors can be used as input features for downstream models. 
104 | # Convenience functions for doing this extraction will be added to the 105 | # package in the near future. 106 | output_vectors <- BERT_feats$output %>% 107 | dplyr::filter(token_index == 1, layer_index == 12) 108 | output_vectors 109 | 110 | ``` 111 | 112 | # Other Functions 113 | 114 | RBERT exports a couple of functions that may be helpful when exploring BERT. 115 | 116 | ```{r, eval = FALSE} 117 | # Both of the functions below require a vocabulary (or a checkpoint containing a 118 | # vocab.txt file) to be specified. 119 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 120 | 121 | # `tokenize_text` is a quick way to see the wordpiece tokenization of some text. 122 | tokens <- tokenize_text(text = "Who doesn't like tacos?", 123 | ckpt_dir = BERT_PRETRAINED_DIR) 124 | # [[1]] 125 | # [1] "[CLS]" "who" "doesn" "'" "t" "like" "ta" "##cos" 126 | # [9] "?" "[SEP]" 127 | 128 | # `check_vocab` checks whether the given words are found in the vocabulary. 129 | check_vocab(words = c("positron", "electron"), ckpt_dir = BERT_PRETRAINED_DIR) 130 | # [1] FALSE TRUE 131 | ``` 132 | 133 | 134 | # Future work 135 | 136 | There's still a lot to do! Check out the 137 | [issues board](https://github.com/macmillanhighered/RBERT/issues) 138 | on the github page. 139 | --------------------------------------------------------------------------------