├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── Contributor_Code_of_Conduct.md ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── download_checkpoint.R ├── extract_features.R ├── modeling.R ├── optimization.R ├── run_classifier.R ├── sysdata.rda ├── tokenization.R └── utils.R ├── RBERT.Rproj ├── README.Rmd ├── README.md ├── appveyor.yml ├── codecov.yml ├── data-raw └── sysdata.R ├── man ├── AdamWeightDecayOptimizer.Rd ├── BasicTokenizer.Rd ├── BertConfig.Rd ├── BertModel.Rd ├── FullTokenizer.Rd ├── InputExample.Rd ├── InputExample_EF.Rd ├── InputFeatures.Rd ├── WordpieceTokenizer.Rd ├── apply_to_chars.Rd ├── assert_rank.Rd ├── attention_layer.Rd ├── bert_config_from_json_file.Rd ├── check_vocab.Rd ├── clean_text.Rd ├── convert_by_vocab.Rd ├── convert_examples_to_features.Rd ├── convert_single_example.Rd ├── convert_to_unicode.Rd ├── create_attention_mask_from_input_mask.Rd ├── create_initializer.Rd ├── create_model.Rd ├── create_optimizer.Rd ├── dot-InputFeatures_EF.Rd ├── dot-choose_BERT_dir.Rd ├── dot-convert_examples_to_features_EF.Rd ├── dot-convert_single_example_EF.Rd ├── dot-download_BERT_checkpoint.Rd ├── dot-get_actual_index.Rd ├── dot-get_model_archive_path.Rd ├── dot-get_model_archive_type.Rd ├── dot-get_model_subdir.Rd ├── dot-get_model_url.Rd ├── dot-has_checkpoint.Rd ├── dot-infer_archive_type.Rd ├── dot-infer_checkpoint_archive_path.Rd ├── dot-infer_ckpt_dir.Rd ├── dot-infer_model_paths.Rd ├── dot-maybe_download_checkpoint.Rd ├── dot-model_fn_builder_EF.Rd ├── dot-process_BERT_checkpoint.Rd ├── download_BERT_checkpoint.Rd ├── dropout.Rd ├── embedding_lookup.Rd ├── embedding_postprocessor.Rd ├── extract_features.Rd ├── figures │ ├── rbert_hex.png │ └── rbert_hex.svg ├── file_based_convert_examples_to_features.Rd ├── file_based_input_fn_builder.Rd ├── find_files.Rd ├── gelu.Rd ├── get_activation.Rd ├── get_assignment_map_from_checkpoint.Rd ├── get_shape_list.Rd ├── input_fn_builder.Rd ├── input_fn_builder_EF.Rd ├── is_chinese_char.Rd ├── is_control.Rd ├── is_punctuation.Rd ├── is_whitespace.Rd ├── layer_norm.Rd ├── layer_norm_and_dropout.Rd ├── load_vocab.Rd ├── make_examples_simple.Rd ├── model_fn_builder.Rd ├── reshape_from_matrix.Rd ├── reshape_to_matrix.Rd ├── set_BERT_dir.Rd ├── split_on_punc.Rd ├── strip_accents.Rd ├── tokenize.Rd ├── tokenize_chinese_chars.Rd ├── tokenize_text.Rd ├── tokenize_word.Rd ├── transformer_model.Rd ├── transpose_for_scores.Rd ├── truncate_seq_pair.Rd └── whitespace_tokenize.Rd ├── tests ├── testthat.R └── testthat │ ├── attention_probs.rds │ ├── bert_config.json │ ├── sample_amap.rds │ ├── sample_feat_in.rds │ ├── sample_feats.rds │ ├── setup.R │ ├── teardown.R │ ├── test-download_checkpoint.R │ ├── test-extract_features.R │ ├── test-modeling.R │ ├── test-optimization.R │ ├── test-run_classifier.R │ ├── test-tokenization.R │ ├── test_checkpoints │ ├── .gitignore │ └── testing_checkpoint.tar.gz │ ├── tokenizer.rds │ ├── vocab.txt │ ├── vocab0.txt │ └── vocab_small.txt └── vignettes ├── .gitignore ├── BERT_basics.Rmd └── RBERT_intro.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^CONTRIBUTING\.md$ 4 | ^LICENSE\.md$ 5 | ^Contributor_Code_of_Conduct\.md$ 6 | ^README\.Rmd$ 7 | ^\.travis\.yml$ 8 | ^appveyor\.yml$ 9 | ^codecov\.yml$ 10 | ^data-raw$ 11 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | vocab_small.txt 6 | vocab.txt 7 | inst/doc 8 | .httr-oauth 9 | README.html 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | warnings_are_errors: false 5 | 6 | env: 7 | global: 8 | - TENSORFLOW_TEST_EXAMPLES="1" 9 | 10 | cache: 11 | packages: true 12 | directories: 13 | - $HOME/.cache/pip 14 | 15 | addons: 16 | apt: 17 | packages: 18 | python3-dev 19 | python3-pip 20 | python3-virtualenv 21 | python3-venv 22 | 23 | before_script: 24 | - python3 -m pip install --upgrade --ignore-installed --user travis virtualenv 25 | - R CMD INSTALL . 26 | - R -e 'tensorflow::install_tensorflow(version = "1.11.0", extra_packages="IPython")' 27 | - R -e 'tensorflow::tf_config()' 28 | 29 | after_success: 30 | - Rscript -e 'covr::codecov()' 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | Want to contribute? Great! First, read this page (including the small print at the end). 4 | 5 | ### Before you contribute 6 | Before we can use your code, you must sign the Macmillan Learning Individual Contributor License Agreement (CLA), which you will be asked to do automatically when you submit a pull request. The CLA is necessary mainly because you own the copyright to your changes, even after your contribution becomes part of our codebase, so we need your permission to use and distribute your code. We also need to be sure of various other things—for instance that you'll tell us if you know that your code infringes on other people's patents. You don't have to sign the CLA until after you've submitted your code for review and a member has approved it, but you must do it before we can put your code into our codebase. 7 | Before you start working on a larger contribution, you should get in touch with us first through the issue tracker with your idea so that we can help out and possibly guide you. Coordinating up front makes it much easier to avoid frustration later on. 8 | 9 | ### Making changes 10 | 11 | We use the github [fork and pull review process](https://help.github.com/articles/using-pull-requests) to review all contributions. First, fork the repository by following the [github instructions](https://help.github.com/articles/fork-a-repo). Then check out your personal fork: 12 | 13 | $ git clone https://github.com//RBERT.git 14 | 15 | Add an upstream remote so you can easily keep up to date with the main repository: 16 | 17 | $ git remote add upstream https://github.com/jonathanbratt/RBERT.git 18 | 19 | To update your local repo from the main: 20 | 21 | $ git pull upstream master 22 | 23 | When you're done making changes, make sure tests pass, and then commit your changes to your personal fork. Then use the GitHub Web UI to create and send the pull request. We'll review and merge the change. 24 | 25 | 26 | ### Code review 27 | 28 | All submissions, including submissions by project members, require review. To keep the code base maintainable and readable, all code is developed using a similar coding style. 
We typically follow the [tidyverse style guide](https://style.tidyverse.org/), with minor changes. 29 | 30 | 31 | The code should be easy to maintain and understand. It is important that you be able to come back, months later, to code that you've written and still quickly understand what it is supposed to be doing. Understandable code also makes it easier for other people to contribute. Quick-and-dirty solutions or "clever" coding tricks might work in the short term, but should be avoided in the interest of long term code quality. 32 | 33 | With the code review process, we ensure that at least two sets of eyes looked over the code in hopes of finding potential bugs or errors (before they become bugs and errors). This also improves the overall code quality and makes sure that every developer knows to (largely) expect the same coding style. 34 | 35 | 36 | [Unit tests](https://testthat.r-lib.org/) are an important part of the code. We aim for 100% test coverage, while recognizing that some functionality may be hard to cover in a unit test. 37 | 38 | 39 | 40 | ### The small print 41 | 42 | Contributions made by corporations will be covered by a 43 | different agreement than the one above. Contact us if this applies to you. 44 | -------------------------------------------------------------------------------- /Contributor_Code_of_Conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 
45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at jon.harmon@macmillan.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: RBERT 2 | Type: Package 3 | Title: R Implementation of BERT 4 | Version: 0.1.11 5 | Authors@R: c( 6 | person("Jonathan", "Bratt", email = "jonathan.bratt@macmillan.com", 7 | role = c("aut", "cre") 8 | ), 9 | person("Jon", "Harmon", email = "jon.harmon@macmillan.com", 10 | role = c("aut") 11 | ), 12 | person(family = "Google Inc.", role = c("ctb", "cph"), 13 | comment = "Original Python code; Examples and Tutorials") 14 | ) 15 | Description: Use pretrained models from Google Research's BERT in R. 16 | Encoding: UTF-8 17 | LazyData: true 18 | URL: https://github.com/jonathanbratt/RBERT 19 | BugReports: https://github.com/jonathanbratt/RBERT/issues 20 | Depends: R (>= 3.5.1) 21 | License: file LICENSE 22 | RoxygenNote: 7.0.2 23 | Imports: 24 | dplyr (>= 0.8.3), 25 | jsonlite (>= 1.6), 26 | purrr (>= 0.3.0), 27 | rappdirs (>= 0.3.1), 28 | reticulate (>= 1.12), 29 | stringi (>= 1.2.4), 30 | stringr (>= 1.4.0), 31 | tensorflow (>= 1.10), 32 | tibble (>= 2.1.3), 33 | tidyr (>= 1.0.0), 34 | utils 35 | Suggests: 36 | testthat (>= 2.1.0), 37 | mockery, 38 | knitr, 39 | rmarkdown, 40 | covr 41 | VignetteBuilder: knitr 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | S3method(tokenize,BasicTokenizer)
4 | S3method(tokenize,FullTokenizer)
5 | S3method(tokenize,WordpieceTokenizer)
6 | export(AdamWeightDecayOptimizer)
7 | export(BasicTokenizer)
8 | export(BertConfig)
9 | export(BertModel)
10 | export(FullTokenizer)
11 | export(InputExample)
12 | export(InputExample_EF)
13 | export(InputFeatures)
14 | export(WordpieceTokenizer)
15 | export(assert_rank)
16 | export(attention_layer)
17 | export(bert_config_from_json_file)
18 | export(check_vocab)
19 | export(convert_by_vocab)
20 | export(convert_examples_to_features)
21 | export(convert_ids_to_tokens)
22 | export(convert_single_example)
23 | export(convert_to_unicode)
24 | export(convert_tokens_to_ids)
25 | export(create_attention_mask_from_input_mask)
26 | export(create_initializer)
27 | export(create_model)
28 | export(create_optimizer)
29 | export(download_BERT_checkpoint)
30 | export(dropout)
31 | export(embedding_lookup)
32 | export(embedding_postprocessor)
33 | export(extract_features)
34 | export(file_based_convert_examples_to_features)
35 | export(file_based_input_fn_builder)
36 | export(find_ckpt)
37 | export(find_config)
38 | export(find_vocab)
39 | export(gelu)
40 | export(get_activation)
41 | export(get_assignment_map_from_checkpoint)
42 | export(get_shape_list)
43 | export(input_fn_builder)
44 | export(layer_norm)
45 | export(layer_norm_and_dropout)
46 | export(load_vocab)
47 | export(make_examples_simple)
48 | export(model_fn_builder)
49 | export(reshape_from_matrix)
50 | export(reshape_to_matrix)
51 | export(set_BERT_dir)
52 | export(tokenize)
53 | export(tokenize_text)
54 | export(tokenize_word)
55 | export(transformer_model)
56 | export(truncate_seq_pair)
57 | export(whitespace_tokenize)
58 | 
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # RBERT 0.1.11
2 | 
3 | * Added parameter to shush verbose `extract_features`.
4 | * Removed vestigial `use_one_hot_embeddings` parameter from everywhere.
5 | * `extract_features` can now take plain character vectors as input.
6 | * `extract_features` can now take a single checkpoint directory or model name,
7 | rather than three separate file paths.
8 | 
9 | # RBERT 0.1.7
10 | 
11 | * Updated `extract_features` to return tidy tibbles (@jonthegeek, #29).
12 | 
13 | # RBERT 0.1.6
14 | 
15 | * Updated `download_BERT_checkpoint` to simplify usage (@jonthegeek, #25).
16 | 
17 | # RBERT 0.1.0
18 | 
19 | * Added a `NEWS.md` file to track changes to the package.
20 | * Initial open source release.
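
To illustrate the simplified interface described in the 0.1.11 entries above, a call might now look roughly like the sketch below. This is a minimal sketch, not a verified signature: the `examples` argument name is an assumption for illustration, while `model` and `ckpt_dir` are the parameters documented for the checkpoint-finding helpers in `R/utils.R`.

``` r
# Plain character vector in; a tidy tibble of contextual features out.
# The named model is downloaded first if it has not already been cached.
feats <- RBERT::extract_features(
  examples = c("The bank is on the river bank."),
  model = "bert_base_uncased"  # or: ckpt_dir = "/path/to/a/local/checkpoint"
)
```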
21 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/R/sysdata.rda -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # find checkpoint files --------------------------------------------------- 16 | 17 | 18 | #' Find Checkpoint Files 19 | #' 20 | #' Given the path to a checkpoint directory, return the paths to certain files 21 | #' in that directory. 22 | #' 23 | #' @param ckpt_dir Character; the path to the checkpoint directory. If this 24 | #' argument is NULL, the associated functions also return NULL. 25 | #' @name find_files 26 | NULL 27 | 28 | #' @describeIn find_files Find the vocabulary file ('vocab.txt'). 29 | #' @export 30 | find_vocab <- function(ckpt_dir) { 31 | # If this gets called for a NULL ckpt_dir, return NULL. 32 | if (is.null(ckpt_dir)) { 33 | return(NULL) 34 | } 35 | 36 | vocab_file <- file.path(ckpt_dir, "vocab.txt") 37 | if (file.exists(vocab_file)) { 38 | return(vocab_file) 39 | } else { 40 | stop("No file named 'vocab.txt' found in ", ckpt_dir) # nocov 41 | } 42 | } 43 | 44 | 45 | #' @describeIn find_files Find the config file ('bert_config.json'). 46 | #' @export 47 | find_config <- function(ckpt_dir) { 48 | # If this gets called for a NULL ckpt_dir, return NULL. 49 | if (is.null(ckpt_dir)) { 50 | return(NULL) 51 | } 52 | 53 | config_file <- file.path(ckpt_dir, "bert_config.json") 54 | if (file.exists(config_file)) { 55 | return(config_file) 56 | } else { 57 | stop("No file named 'bert_config.json' found in ", ckpt_dir) # nocov 58 | } 59 | } 60 | 61 | #' @describeIn find_files Find the checkpoint file stub (files begin with 62 | #' 'bert_model.ckpt'). 63 | #' @export 64 | find_ckpt <- function(ckpt_dir) { 65 | # If this gets called for a NULL ckpt_dir, return NULL. 66 | if (is.null(ckpt_dir)) { 67 | return(NULL) 68 | } 69 | 70 | # The path we want to return here isn't an actual file, but a name stub for 71 | # files with suffixes '.index', '.meta', etc. 72 | ckpt_filestub <- file.path(ckpt_dir, "bert_model.ckpt") 73 | ckpt_file1 <- file.path(ckpt_dir, "bert_model.ckpt.index") 74 | ckpt_file2 <- file.path(ckpt_dir, "bert_model.ckpt.meta") 75 | if (file.exists(ckpt_file1) & file.exists(ckpt_file2)) { 76 | return(ckpt_filestub) 77 | } else { 78 | stop("Checkpoint file(s) missing from ", ckpt_dir) # nocov 79 | } 80 | } 81 | 82 | #' Find Paths to Checkpoint Files 83 | #' 84 | #' In some functions, the user can specify a model, a ckpt_dir, and/or specific 85 | #' paths to checkpoint files. This function sorts all of that out. 
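#' Explicitly supplied file paths are used as given; any that are missing are
#' inferred from ckpt_dir; if paths are still missing, the named model is
#' downloaded (if necessary) and its checkpoint directory is used instead.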
86 | #' 87 | #' @inheritParams extract_features 88 | #' @return A list with components vocab_file, bert_config_file, and 89 | #' init_checkpoint. 90 | #' @keywords internal 91 | .infer_model_paths <- function(model = c( 92 | "bert_base_uncased", 93 | "bert_base_cased", 94 | "bert_large_uncased", 95 | "bert_large_cased", 96 | "bert_large_uncased_wwm", 97 | "bert_large_cased_wwm", 98 | "bert_base_multilingual_cased", 99 | "bert_base_chinese", 100 | "scibert_scivocab_uncased", 101 | "scibert_scivocab_cased", 102 | "scibert_basevocab_uncased", 103 | "scibert_basevocab_cased" 104 | ), 105 | ckpt_dir = NULL, 106 | vocab_file = find_vocab(ckpt_dir), 107 | bert_config_file = find_config(ckpt_dir), 108 | init_checkpoint = find_ckpt(ckpt_dir)) { 109 | # Deal with the fact that model will never be *missing* when this function is 110 | # called, but we don't want the calling functions to have to deal with parsing 111 | # the argument. 112 | if (length(model) > 1) { 113 | model <- NULL 114 | } else { 115 | model <- match.arg(model) 116 | } 117 | 118 | # If any of the necessary files aren't specified, try to find them. This would 119 | # most likely only happen if they specified one file but not all (and left 120 | # ckpt_dir as NULL), but run this to be sure. 121 | vocab_file <- vocab_file %||% find_vocab(ckpt_dir) 122 | bert_config_file <- bert_config_file %||% find_config(ckpt_dir) 123 | init_checkpoint <- init_checkpoint %||% find_ckpt(ckpt_dir) 124 | 125 | # At this point either we have the paths, or we need to infer from the model. 126 | if ((is.null(vocab_file) | 127 | is.null(bert_config_file) | 128 | is.null(init_checkpoint))) { 129 | if (is.null(model)) { 130 | stop( 131 | "You must specify a model, a ckpt_dir, or the locations of ", 132 | "vocab_file, bert_config_file, and init_checkpoint." 133 | ) 134 | } else { 135 | dir <- .choose_BERT_dir(NULL) 136 | ckpt_dir <- .get_model_subdir(model, dir) 137 | .maybe_download_checkpoint( 138 | model = model, 139 | dir = dir, 140 | ckpt_dir = ckpt_dir 141 | ) 142 | 143 | # If we made it here, they have the model, so set the file locations. 
144 | vocab_file <- find_vocab(ckpt_dir) 145 | bert_config_file <- find_config(ckpt_dir) 146 | init_checkpoint <- find_ckpt(ckpt_dir) 147 | } 148 | } 149 | return( 150 | list( 151 | vocab_file = vocab_file, 152 | bert_config_file = bert_config_file, 153 | init_checkpoint = init_checkpoint 154 | ) 155 | ) 156 | } 157 | -------------------------------------------------------------------------------- /RBERT.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | # RBERT 16 | 17 | 18 | [![Lifecycle: superseded](https://img.shields.io/badge/lifecycle-superseded-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html#superseded) 19 | [![Travis build status](https://travis-ci.org/jonathanbratt/RBERT.svg?branch=master)](https://travis-ci.org/jonathanbratt/RBERT) 20 | [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/jonathanbratt/RBERT?branch=master&svg=true)](https://ci.appveyor.com/project/jonathanbratt/RBERT) 21 | [![Codecov test coverage](https://codecov.io/gh/jonathanbratt/RBERT/branch/master/graph/badge.svg)](https://codecov.io/gh/jonathanbratt/RBERT?branch=master) 22 | 23 | 24 | We are re-implementing BERT for R in [{torchtransformers}](https://github.com/macmillancontentscience/torchtransformers). We find {torch} much easier to work with in R than {tensorflow}, and strongly recommend starting there! 25 | 26 | --- 27 | 28 | RBERT is an R implementation of the Python package [BERT](https://github.com/google-research/bert) developed at Google for Natural Language Processing. 29 | 30 | ## Installation 31 | 32 | You can install RBERT from [GitHub](https://github.com/) with: 33 | 34 | ```{r installation, eval = FALSE} 35 | # install.packages("devtools") 36 | devtools::install_github( 37 | "jonathanbratt/RBERT", 38 | build_vignettes = TRUE 39 | ) 40 | ``` 41 | 42 | ### TensorFlow Installation 43 | 44 | RBERT requires TensorFlow. Currently the version must be <= 1.13.1. You can install it using the tensorflow package (installed as a dependency of this package; see note below about Windows). 45 | 46 | ```{r tensorflow, eval = FALSE} 47 | tensorflow::install_tensorflow(version = "1.13.1") 48 | ``` 49 | 50 | ### Windows 51 | 52 | The current CRAN version of reticulate (1.13) causes some issues with the tensorflow installation. Rebooting your machine after installing Anaconda seems to fix this issue, or upgrade to the development version of reticulate. 53 | 54 | ```{r install dev reticulate, eval = FALSE} 55 | devtools::install_github("rstudio/reticulate") 56 | ``` 57 | 58 | ## Basic usage 59 | 60 | RBERT is a work in progress. 
While fine-tuning a BERT model using RBERT may be possible, it is not currently recommended. 61 | 62 | RBERT is best suited for exploring pre-trained BERT models, and obtaining contextual representations of input text for use as features in downstream tasks. 63 | 64 | * See the "Introduction to RBERT" vignette included with the package for more specific examples. 65 | * For a quick explanation of what BERT is, see the "BERT Basics" vignette. 66 | * The package [RBERTviz](https://github.com/jonathanbratt/RBERTviz) provides tools for making fun and easy visualizations of BERT data. 67 | 68 | ## Running Tests 69 | 70 | The first time you run the test suite, the 388.8MB bert_base_uncased.zip file will download in your `tests/testthat/test_checkpoints` directory. Subsequent test runs will use that download. This was our best compromise to allow for relatively rapid testing without bloating the repository. 71 | 72 | ## Disclaimer 73 | 74 | This is not an officially supported Macmillan Learning product. 75 | 76 | ## Contact information 77 | 78 | Questions or comments should be directed to Jonathan Bratt (jonathan.bratt@macmillan.com) and Jon Harmon (jon.harmon@macmillan.com). 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # RBERT 5 | 6 | 7 | 8 | [![Lifecycle: 9 | superseded](https://img.shields.io/badge/lifecycle-superseded-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html#superseded) 10 | [![Travis build 11 | status](https://travis-ci.org/jonathanbratt/RBERT.svg?branch=master)](https://travis-ci.org/jonathanbratt/RBERT) 12 | [![AppVeyor build 13 | status](https://ci.appveyor.com/api/projects/status/github/jonathanbratt/RBERT?branch=master&svg=true)](https://ci.appveyor.com/project/jonathanbratt/RBERT) 14 | [![Codecov test 15 | coverage](https://codecov.io/gh/jonathanbratt/RBERT/branch/master/graph/badge.svg)](https://codecov.io/gh/jonathanbratt/RBERT?branch=master) 16 | 17 | 18 | We are re-implementing BERT for R in 19 | [{torchtransformers}](https://github.com/macmillancontentscience/torchtransformers). 20 | We find {torch} much easier to work with in R than {tensorflow}, and 21 | strongly recommend starting there! 22 | 23 | ------------------------------------------------------------------------ 24 | 25 | RBERT is an R implementation of the Python package 26 | [BERT](https://github.com/google-research/bert) developed at Google for 27 | Natural Language Processing. 28 | 29 | ## Installation 30 | 31 | You can install RBERT from [GitHub](https://github.com/) with: 32 | 33 | ``` r 34 | # install.packages("devtools") 35 | devtools::install_github( 36 | "jonathanbratt/RBERT", 37 | build_vignettes = TRUE 38 | ) 39 | ``` 40 | 41 | ### TensorFlow Installation 42 | 43 | RBERT requires TensorFlow. Currently the version must be \<= 1.13.1. You 44 | can install it using the tensorflow package (installed as a dependency 45 | of this package; see note below about Windows). 46 | 47 | ``` r 48 | tensorflow::install_tensorflow(version = "1.13.1") 49 | ``` 50 | 51 | ### Windows 52 | 53 | The current CRAN version of reticulate (1.13) causes some issues with 54 | the tensorflow installation. Rebooting your machine after installing 55 | Anaconda seems to fix this issue, or upgrade to the development version 56 | of reticulate. 
57 | 58 | ``` r 59 | devtools::install_github("rstudio/reticulate") 60 | ``` 61 | 62 | ## Basic usage 63 | 64 | RBERT is a work in progress. While fine-tuning a BERT model using RBERT 65 | may be possible, it is not currently recommended. 66 | 67 | RBERT is best suited for exploring pre-trained BERT models, and 68 | obtaining contextual representations of input text for use as features 69 | in downstream tasks. 70 | 71 | - See the “Introduction to RBERT” vignette included with the package 72 | for more specific examples. 73 | - For a quick explanation of what BERT is, see the “BERT Basics” 74 | vignette. 75 | - The package [RBERTviz](https://github.com/jonathanbratt/RBERTviz) 76 | provides tools for making fun and easy visualizations of BERT data. 77 | 78 | ## Running Tests 79 | 80 | The first time you run the test suite, the 388.8MB bert_base_uncased.zip 81 | file will download in your `tests/testthat/test_checkpoints` directory. 82 | Subsequent test runs will use that download. This was our best 83 | compromise to allow for relatively rapid testing without bloating the 84 | repository. 85 | 86 | ## Disclaimer 87 | 88 | This is not an officially supported Macmillan Learning product. 89 | 90 | ## Contact information 91 | 92 | Questions or comments should be directed to Jonathan Bratt 93 | () and Jon Harmon 94 | (). 95 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | environment: 4 | global: 5 | USE_RTOOLS: true 6 | R_REMOTES_STANDALONE: true 7 | matrix: 8 | - TF_VERSION: 1.12.0 9 | - TF_VERSION: 1.13.1 10 | # - TF_VERSION: 1.14.0 11 | # - TF_VERSION: 2.0.0-rc0 12 | 13 | # Download script file from GitHub 14 | init: 15 | ps: | 16 | $ErrorActionPreference = "Stop" 17 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 18 | Import-Module '..\appveyor-tool.ps1' 19 | 20 | install: 21 | ps: Bootstrap 22 | 23 | cache: 24 | - C:\RLibrary 25 | 26 | # Adapt as necessary starting from here 27 | 28 | build_script: 29 | - travis-tool.sh install_deps 30 | - R CMD INSTALL . 
31 | - R -e "install.packages('devtools', repos = 'http://cran.rstudio.com'); devtools::install_github('rstudio/reticulate')" 32 | - R -e "tensorflow::install_tensorflow(method = 'conda', version = Sys.getenv('TF_VERSION'), extra_packages = 'IPython', envname = 'r-reticulate')" 33 | 34 | test_script: 35 | - travis-tool.sh run_tests 36 | 37 | on_failure: 38 | - 7z a failure.zip *.Rcheck\* 39 | - appveyor PushArtifact failure.zip 40 | 41 | artifacts: 42 | - path: '*.Rcheck\**\*.log' 43 | name: Logs 44 | 45 | - path: '*.Rcheck\**\*.out' 46 | name: Logs 47 | 48 | - path: '*.Rcheck\**\*.fail' 49 | name: Logs 50 | 51 | - path: '*.Rcheck\**\*.Rout' 52 | name: Logs 53 | 54 | - path: '\*_*.tar.gz' 55 | name: Bits 56 | 57 | - path: '\*_*.zip' 58 | name: Bits 59 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /data-raw/sysdata.R: -------------------------------------------------------------------------------- 1 | library(magrittr) 2 | google_base_url <- "https://storage.googleapis.com/bert_models/" 3 | scibert_base_url <- "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/" 4 | 5 | checkpoint_url_map <- c( 6 | "bert_base_uncased" = paste0( 7 | google_base_url, 8 | "2018_10_18/uncased_L-12_H-768_A-12.zip" 9 | ), 10 | "bert_base_cased" = paste0( 11 | google_base_url, 12 | "2018_10_18/cased_L-12_H-768_A-12.zip" 13 | ), 14 | "bert_large_uncased" = paste0( 15 | google_base_url, 16 | "2018_10_18/uncased_L-24_H-1024_A-16.zip" 17 | ), 18 | "bert_large_cased" = paste0( 19 | google_base_url, 20 | "2018_10_18/cased_L-24_H-1024_A-16.zip" 21 | ), 22 | "bert_large_uncased_wwm" = paste0( 23 | google_base_url, 24 | "2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip" 25 | ), 26 | "bert_large_cased_wwm" = paste0( 27 | google_base_url, 28 | "2019_05_30/wwm_cased_L-24_H-1024_A-16.zip" 29 | ), 30 | "bert_base_multilingual_cased" = paste0( 31 | google_base_url, 32 | "2018_11_23/multi_cased_L-12_H-768_A-12.zip" 33 | ), 34 | "bert_base_chinese" = paste0( 35 | google_base_url, 36 | "2018_11_03/chinese_L-12_H-768_A-12.zip" 37 | ), 38 | "scibert_scivocab_uncased" = paste0( 39 | scibert_base_url, 40 | "scibert_scivocab_uncased.tar.gz" 41 | ), 42 | "scibert_scivocab_cased" = paste0( 43 | scibert_base_url, 44 | "scibert_scivocab_cased.tar.gz" 45 | ), 46 | "scibert_basevocab_uncased" = paste0( 47 | scibert_base_url, 48 | "scibert_basevocab_uncased.tar.gz" 49 | ), 50 | "scibert_basevocab_cased" = paste0( 51 | scibert_base_url, 52 | "scibert_basevocab_cased.tar.gz" 53 | ) 54 | ) 55 | 56 | # I want to convert this to a tibble with more info, but I don't want to 57 | # reformat all that, so I'm using enframe. 
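# (enframe() turns the named character vector into a two-column tibble, one row
# per model, with the names in the 'model' column and the URLs in 'url'; the
# mutate() below then tags each row with its archive type: zip for the Google
# models, tar-gzip for the SciBERT ones.)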
58 | checkpoint_url_map <- tibble::enframe( 59 | checkpoint_url_map, 60 | name = "model", value = "url" 61 | ) %>% 62 | dplyr::mutate( 63 | archive_type = c( 64 | rep("zip", 8), 65 | rep("tar-gzip", 4) 66 | ) 67 | ) 68 | 69 | usethis::use_data( 70 | checkpoint_url_map, 71 | internal = TRUE, 72 | overwrite = TRUE 73 | ) 74 | rm( 75 | google_base_url, 76 | scibert_base_url, 77 | checkpoint_url_map 78 | ) 79 | -------------------------------------------------------------------------------- /man/AdamWeightDecayOptimizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/optimization.R 3 | \name{AdamWeightDecayOptimizer} 4 | \alias{AdamWeightDecayOptimizer} 5 | \title{Constructor for objects of class AdamWeightDecayOptimizer} 6 | \usage{ 7 | AdamWeightDecayOptimizer( 8 | learning_rate, 9 | weight_decay_rate = 0, 10 | beta_1 = 0.9, 11 | beta_2 = 0.999, 12 | epsilon = 1e-06, 13 | exclude_from_weight_decay = NULL, 14 | name = "AdamWeightDecayOptimizer" 15 | ) 16 | } 17 | \arguments{ 18 | \item{learning_rate}{Numeric Tensor (single element?); learning rate.} 19 | 20 | \item{weight_decay_rate}{Numeric; weight decay rate.} 21 | 22 | \item{beta_1}{Numeric; parameter for Adam.} 23 | 24 | \item{beta_2}{Numeric; parameter for Adam.} 25 | 26 | \item{epsilon}{Numeric; a tiny number to put a cap on update size by avoiding 27 | dividing by even smaller numbers.} 28 | 29 | \item{exclude_from_weight_decay}{Character; list of parameter names to 30 | exclude from weight decay.} 31 | 32 | \item{name}{Character; the name of the constructed object.} 33 | } 34 | \value{ 35 | An object of class "AdamWeightDecayOptimizer", which is a (hacky) 36 | modification of the tf.train.Optimizer class. 37 | } 38 | \description{ 39 | A basic Adam optimizer that includes "correct" L2 weight decay. 40 | } 41 | \details{ 42 | Inherits from class tf.train.Optimizer. 43 | \url{https://devdocs.io/tensorflow~python/tf/train/optimizer} 44 | } 45 | \examples{ 46 | \dontrun{ 47 | with(tensorflow::tf$variable_scope("examples", 48 | reuse = tensorflow::tf$AUTO_REUSE 49 | ), { 50 | optimizer <- AdamWeightDecayOptimizer(learning_rate = 0.01) 51 | }) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /man/BasicTokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{BasicTokenizer} 4 | \alias{BasicTokenizer} 5 | \title{Construct objects of BasicTokenizer class.} 6 | \usage{ 7 | BasicTokenizer(do_lower_case = TRUE) 8 | } 9 | \arguments{ 10 | \item{do_lower_case}{Logical; the value to give to the "do_lower_case" 11 | argument in the BasicTokenizer object.} 12 | } 13 | \value{ 14 | an object of class BasicTokenizer 15 | } 16 | \description{ 17 | (I'm not sure that this object-based approach is best for R implementation, 18 | but for now just trying to reproduce python functionality.) 
19 | } 20 | \details{ 21 | Has methods: `tokenize.BasicTokenizer()` `run_strip_accents.BasicTokenizer()` 22 | (internal use) `run_split_on_punc.BasicTokenizer()` (internal use) 23 | `tokenize_chinese_chars.BasicTokenizer()` (internal use) 24 | `is_chinese_char.BasicTokenizer()` (internal use) 25 | `clean_text.BasicTokenizer()` (internal use) 26 | } 27 | \examples{ 28 | \dontrun{ 29 | b_tokenizer <- BasicTokenizer(TRUE) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/BertConfig.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{BertConfig} 4 | \alias{BertConfig} 5 | \title{Construct objects of BertConfig class} 6 | \usage{ 7 | BertConfig( 8 | vocab_size, 9 | hidden_size = 768L, 10 | num_hidden_layers = 12L, 11 | num_attention_heads = 12L, 12 | intermediate_size = 3072L, 13 | hidden_act = "gelu", 14 | hidden_dropout_prob = 0.1, 15 | attention_probs_dropout_prob = 0.1, 16 | max_position_embeddings = 512L, 17 | type_vocab_size = 16L, 18 | initializer_range = 0.02 19 | ) 20 | } 21 | \arguments{ 22 | \item{vocab_size}{Integer; vocabulary size of \code{inputs_ids} in 23 | \code{BertModel}.} 24 | 25 | \item{hidden_size}{Integer; size of the encoder layers and the pooler layer.} 26 | 27 | \item{num_hidden_layers}{Integer; number of hidden layers in the Transformer 28 | encoder.} 29 | 30 | \item{num_attention_heads}{Integer; number of attention heads for each 31 | attention layer in the Transformer encoder.} 32 | 33 | \item{intermediate_size}{Integer; the size of the "intermediate" (i.e., 34 | feed-forward) layer in the Transformer encoder.} 35 | 36 | \item{hidden_act}{The non-linear activation function (function or string) in 37 | the encoder and pooler.} 38 | 39 | \item{hidden_dropout_prob}{Numeric; the dropout probability for all fully 40 | connected layers in the embeddings, encoder, and pooler.} 41 | 42 | \item{attention_probs_dropout_prob}{Numeric; the dropout ratio for the 43 | attention probabilities.} 44 | 45 | \item{max_position_embeddings}{Integer; the maximum sequence length that this 46 | model might ever be used with. Typically set this to something large just 47 | in case (e.g., 512 or 1024 or 2048).} 48 | 49 | \item{type_vocab_size}{Integer; the vocabulary size of the 50 | \code{token_type_ids} passed into \code{BertModel}.} 51 | 52 | \item{initializer_range}{Numeric; the stdev of the 53 | truncated_normal_initializer for initializing all weight matrices.} 54 | } 55 | \value{ 56 | An object of class BertConfig 57 | } 58 | \description{ 59 | Given a set of values as parameter inputs, construct a BertConfig object with 60 | those values. 61 | } 62 | \examples{ 63 | \dontrun{ 64 | BertConfig(vocab_size = 30522L) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /man/BertModel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{BertModel} 4 | \alias{BertModel} 5 | \title{Construct object of class BertModel} 6 | \usage{ 7 | BertModel( 8 | config, 9 | is_training, 10 | input_ids, 11 | input_mask = NULL, 12 | token_type_ids = NULL, 13 | scope = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{config}{\code{BertConfig} instance.} 18 | 19 | \item{is_training}{Logical; TRUE for training model, FALSE for eval model. 
20 | Controls whether dropout will be applied.} 21 | 22 | \item{input_ids}{Int32 Tensor of shape \code{[batch_size, seq_length]}.} 23 | 24 | \item{input_mask}{(optional) Int32 Tensor of shape \code{[batch_size, 25 | seq_length]}.} 26 | 27 | \item{token_type_ids}{(optional) Int32 Tensor of shape \code{[batch_size, 28 | seq_length]}.} 29 | 30 | \item{scope}{(optional) Character; name for variable scope. Defaults to 31 | "bert".} 32 | } 33 | \value{ 34 | An object of class BertModel. 35 | } 36 | \description{ 37 | An object of class BertModel has several elements: 38 | \describe{ 39 | \item{embedding_output}{float Tensor of shape \code{[batch_size, seq_length, 40 | hidden_size]} corresponding to the output of the embedding layer, after 41 | summing the word embeddings with the positional embeddings and the token type 42 | embeddings, then performing layer normalization. This is the input to the 43 | transformer.} 44 | \item{embedding_table}{The table for the token embeddings.} 45 | \item{all_encoder_layers}{A list of float Tensors of shape \code{[batch_size, 46 | seq_length, hidden_size]}, corresponding to all the hidden transformer 47 | layers.} 48 | \item{sequence_output}{float Tensor of shape \code{[batch_size, seq_length, 49 | hidden_size]} corresponding to the final hidden layer of the transformer 50 | encoder.} 51 | \item{pooled_output}{The dense layer on top of the hidden layer for the first 52 | token.} 53 | } 54 | } 55 | \examples{ 56 | \dontrun{ 57 | with(tensorflow::tf$variable_scope("examples", 58 | reuse = tensorflow::tf$AUTO_REUSE 59 | ), { 60 | input_ids <- tensorflow::tf$constant(list( 61 | list(31L, 51L, 99L), 62 | list(15L, 5L, 0L) 63 | )) 64 | 65 | input_mask <- tensorflow::tf$constant(list( 66 | list(1L, 1L, 1L), 67 | list(1L, 1L, 0L) 68 | )) 69 | token_type_ids <- tensorflow::tf$constant(list( 70 | list(0L, 0L, 1L), 71 | list(0L, 2L, 0L) 72 | )) 73 | config <- BertConfig( 74 | vocab_size = 32000L, 75 | hidden_size = 768L, 76 | num_hidden_layers = 8L, 77 | num_attention_heads = 12L, 78 | intermediate_size = 1024L 79 | ) 80 | model <- BertModel( 81 | config = config, 82 | is_training = TRUE, 83 | input_ids = input_ids, 84 | input_mask = input_mask, 85 | token_type_ids = token_type_ids 86 | ) 87 | }) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /man/FullTokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{FullTokenizer} 4 | \alias{FullTokenizer} 5 | \title{Construct objects of FullTokenizer class.} 6 | \usage{ 7 | FullTokenizer(vocab_file, do_lower_case = TRUE) 8 | } 9 | \arguments{ 10 | \item{vocab_file}{Path to text file containing list of vocabulary tokens.} 11 | 12 | \item{do_lower_case}{Logical: do we convert everything to lowercase?} 13 | } 14 | \value{ 15 | An object of class FullTokenizer. 16 | } 17 | \description{ 18 | Construct objects of FullTokenizer class. 
19 | }
20 | \examples{
21 | \dontrun{
22 | f_tokenizer <- FullTokenizer("vocab.txt", TRUE)
23 | }
24 | }
25 | 
--------------------------------------------------------------------------------
/man/InputExample.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_classifier.R
3 | \name{InputExample}
4 | \alias{InputExample}
5 | \title{Construct objects of class \code{InputExample}}
6 | \usage{
7 | InputExample(guid, text_a, text_b = NULL, label = NULL)
8 | }
9 | \arguments{
10 | \item{guid}{Unique id for the example (character or integer?).}
11 | 
12 | \item{text_a}{Character; the untokenized text of the first sequence. For
13 | single sequence tasks, only this sequence must be specified.}
14 | 
15 | \item{text_b}{(Optional) Character; the untokenized text of the second
16 | sequence. Must be specified only for sequence pair tasks.}
17 | 
18 | \item{label}{(Optional) Character; the label of the example. This should be
19 | specified for train and dev examples, but not for test examples.}
20 | }
21 | \value{
22 | An object of class \code{InputExample}.
23 | }
24 | \description{
25 | An input example is a single training/test example for simple sequence
26 | classification.
27 | }
28 | \examples{
29 | \dontrun{
30 | input_ex <- InputExample(guid = 0, text_a = "Some text to classify.")
31 | }
32 | }
33 | 
--------------------------------------------------------------------------------
/man/InputExample_EF.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_features.R
3 | \name{InputExample_EF}
4 | \alias{InputExample_EF}
5 | \title{Construct objects of class \code{InputExample_EF}}
6 | \usage{
7 | InputExample_EF(unique_id, text_a, text_b = NULL)
8 | }
9 | \arguments{
10 | \item{unique_id}{Integer or character; a unique id for this example.}
11 | 
12 | \item{text_a}{Character; the untokenized text of the first sequence.}
13 | 
14 | \item{text_b}{(Optional) Character; the untokenized text of the second
15 | sequence.}
16 | }
17 | \value{
18 | An object of class \code{InputExample_EF}.
19 | }
20 | \description{
21 | An InputExample_EF is a single test example for feature extraction. Note that
22 | this class is similar to the InputExample class used for simple sequence
23 | classification, but doesn't have a label property. The name of the id
24 | property is also annoyingly different; we should eventually standardize this
25 | better than the Python folks did. (RBERT issue #28.)
26 | }
27 | \examples{
28 | input_ex <- InputExample_EF(
29 | unique_id = 1,
30 | text_a = "I work at the bank."
31 | ) 32 | } 33 | -------------------------------------------------------------------------------- /man/InputFeatures.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{InputFeatures} 4 | \alias{InputFeatures} 5 | \title{Construct objects of class \code{InputFeatures}} 6 | \usage{ 7 | InputFeatures( 8 | input_ids, 9 | input_mask, 10 | segment_ids, 11 | label_id, 12 | is_real_example = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_ids}{Integer Tensor; the sequence of token ids in this example.} 17 | 18 | \item{input_mask}{Integer Tensor; sequence of 1s (for "real" tokens) and 0s 19 | (for padding tokens).} 20 | 21 | \item{segment_ids}{Integer Tensor; aka token_type_ids. Indicators for which 22 | sentence (or sequence) each token belongs to. Classical BERT supports only 23 | 0s and 1s (for first and second sentence, respectively).} 24 | 25 | \item{label_id}{Integer; represents training example classification labels.} 26 | 27 | \item{is_real_example}{Logical; later on this is used as a flag for whether 28 | to "count" this example for calculating accuracy and loss.} 29 | } 30 | \value{ 31 | An object of class \code{InputFeatures}. 32 | } 33 | \description{ 34 | An InputFeatures object is a single set of features of data. 35 | } 36 | \examples{ 37 | \dontrun{ 38 | features <- InputFeatures(input_ids, input_mask, segment_ids, label_id) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /man/WordpieceTokenizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{WordpieceTokenizer} 4 | \alias{WordpieceTokenizer} 5 | \title{Construct objects of WordpieceTokenizer class.} 6 | \usage{ 7 | WordpieceTokenizer(vocab, unk_token = "[UNK]", max_input_chars_per_word = 200) 8 | } 9 | \arguments{ 10 | \item{vocab}{Recognized vocabulary tokens, as a named integer vector. (Name 11 | is token, value is index.)} 12 | 13 | \item{unk_token}{Token to use for unknown words.} 14 | 15 | \item{max_input_chars_per_word}{Length of longest word we will recognize.} 16 | } 17 | \value{ 18 | An object of class WordpieceTokenizer. 19 | } 20 | \description{ 21 | (I'm not sure that this object-based approach is best for R implementation, 22 | but for now just trying to reproduce python functionality.) 23 | } 24 | \details{ 25 | Has method: tokenize.WordpieceTokenizer() 26 | } 27 | \examples{ 28 | \dontrun{ 29 | vocab <- load_vocab(vocab_file = "vocab.txt") 30 | wp_tokenizer <- WordpieceTokenizer(vocab) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/apply_to_chars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{apply_to_chars} 4 | \alias{apply_to_chars} 5 | \title{Apply a function to each character in a string.} 6 | \usage{ 7 | apply_to_chars(text, .f, ...) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar to process.} 11 | 12 | \item{.f}{The function to apply to each character.
Should return a character 13 | scalar, given a single-character input.} 14 | 15 | \item{...}{Other arguments to pass to .f.} 16 | } 17 | \value{ 18 | The character scalar obtained by applying the given function to 19 | each character of the input string, and concatenating the results. 20 | } 21 | \description{ 22 | Utility function for something done a lot in this package. 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/assert_rank.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{assert_rank} 4 | \alias{assert_rank} 5 | \title{Confirm the rank of a tensor} 6 | \usage{ 7 | assert_rank(tensor, expected_rank, name = NULL) 8 | } 9 | \arguments{ 10 | \item{tensor}{A tf.Tensor to check the rank of.} 11 | 12 | \item{expected_rank}{Integer vector or list of integers, expected rank.} 13 | 14 | \item{name}{Optional name of the tensor for the error message.} 15 | } 16 | \value{ 17 | TRUE if the Tensor is of the expected rank (error otherwise). 18 | } 19 | \description{ 20 | Throws an error if the tensor rank is not of the expected rank. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | with(tensorflow::tf$variable_scope("examples", 25 | reuse = tensorflow::tf$AUTO_REUSE 26 | ), { 27 | ids <- tensorflow::tf$get_variable("x", dtype = "int32", shape = 10L) 28 | assert_rank(ids, 1) 29 | assert_rank(ids, 1:2) 30 | assert_rank(ids, 2) 31 | }) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/attention_layer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{attention_layer} 4 | \alias{attention_layer} 5 | \title{Build multi-headed attention layer} 6 | \usage{ 7 | attention_layer( 8 | from_tensor, 9 | to_tensor, 10 | attention_mask = NULL, 11 | num_attention_heads = 1L, 12 | size_per_head = 512L, 13 | query_act = NULL, 14 | key_act = NULL, 15 | value_act = NULL, 16 | attention_probs_dropout_prob = 0, 17 | initializer_range = 0.02, 18 | do_return_2d_tensor = FALSE, 19 | batch_size = NULL, 20 | from_seq_length = NULL, 21 | to_seq_length = NULL 22 | ) 23 | } 24 | \arguments{ 25 | \item{from_tensor}{Float Tensor of shape \code{[batch_size, from_seq_length, 26 | from_width]}.} 27 | 28 | \item{to_tensor}{Float Tensor of shape \code{[batch_size, to_seq_length, 29 | to_width]}.} 30 | 31 | \item{attention_mask}{(optional) Integer Tensor of shape \code{[batch_size, 32 | from_seq_length, to_seq_length]}. The values should be 1 or 0. The 33 | attention scores will effectively be set to -infinity for any positions in 34 | the mask that are 0, and will be unchanged for positions that are 1.} 35 | 36 | \item{num_attention_heads}{Integer; number of attention heads.} 37 | 38 | \item{size_per_head}{Integer; size of each attention head.} 39 | 40 | \item{query_act}{(Optional) Activation function for the query transform.} 41 | 42 | \item{key_act}{(Optional) Activation function for the key transform.} 43 | 44 | \item{value_act}{(Optional) Activation function for the value transform.} 45 | 46 | \item{attention_probs_dropout_prob}{(Optional) Numeric; dropout probability 47 | of the attention probabilities.} 48 | 49 | \item{initializer_range}{Numeric; range of the weight initializer.} 50 | 51 | \item{do_return_2d_tensor}{Logical. 
If TRUE, the output will be of shape 52 | \code{[batch_size * from_seq_length, num_attention_heads * size_per_head]}. 53 | If FALSE, the output will be of shape \code{[batch_size, from_seq_length, 54 | num_attention_heads * size_per_head]}.} 55 | 56 | \item{batch_size}{(Optional) Integer; if the input is 2D, this might (sic) be 57 | the batch size of the 3D version of the \code{from_tensor} and 58 | \code{to_tensor}.} 59 | 60 | \item{from_seq_length}{(Optional) Integer; if the input is 2D, this might be 61 | the seq length of the 3D version of the \code{from_tensor}.} 62 | 63 | \item{to_seq_length}{(Optional) Integer; if the input is 2D, this might be 64 | the seq length of the 3D version of the \code{to_tensor}.} 65 | } 66 | \value{ 67 | float Tensor of shape \code{[batch_size, from_seq_length, 68 | num_attention_heads * size_per_head]}. If \code{do_return_2d_tensor} is 69 | TRUE, it will be flattened to shape \code{[batch_size * from_seq_length, 70 | num_attention_heads * size_per_head]}. 71 | } 72 | \description{ 73 | Performs multi-headed attention from \code{from_tensor} to \code{to_tensor}. 74 | This is an implementation of multi-headed attention based on "Attention Is 75 | All You Need". If \code{from_tensor} and \code{to_tensor} are the same, then 76 | this is self-attention. Each timestep in \code{from_tensor} attends to the 77 | corresponding sequence in \code{to_tensor}, and returns a fixed-width vector. 78 | This function first projects \code{from_tensor} into a "query" tensor and 79 | \code{to_tensor} into "key" and "value" tensors. These are (effectively) a 80 | list of tensors of length \code{num_attention_heads}, where each tensor is of 81 | shape \code{[batch_size, seq_length, size_per_head]}. Then, the query and key 82 | tensors are dot-producted and scaled. These are softmaxed to obtain attention 83 | probabilities. The value tensors are then interpolated by these 84 | probabilities, then concatenated back to a single tensor and returned. 85 | } 86 | \details{ 87 | In practice, the multi-headed attention is done with transposes and reshapes 88 | rather than actual separate tensors. 89 | } 90 | \examples{ 91 | \dontrun{ 92 | # Maybe add examples later. For now, this is only called from 93 | # within transformer_model(), so refer to that function. 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /man/bert_config_from_json_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{bert_config_from_json_file} 4 | \alias{bert_config_from_json_file} 5 | \title{Load BERT config object from json file} 6 | \usage{ 7 | bert_config_from_json_file(json_file) 8 | } 9 | \arguments{ 10 | \item{json_file}{Character; the path to a json config file.} 11 | } 12 | \value{ 13 | An object of class BertConfig 14 | } 15 | \description{ 16 | Given a path to a json config file, construct a BertConfig object with 17 | appropriate values.
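For orientation, the json file shipped with the Google checkpoints is a flat list of model hyperparameters. A rough sketch of reading such a file directly (this is not the package's implementation; it assumes the jsonlite package is available, and the comments show the usual BERT-Base values):

cfg <- jsonlite::fromJSON("uncased_L-12_H-768_A-12/bert_config.json")
cfg$hidden_size          # 768
cfg$num_hidden_layers    # 12
cfg$num_attention_heads  # 12
cfg$vocab_size           # 30522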
18 | } 19 | \examples{ 20 | \dontrun{ 21 | temp_dir <- tempdir() 22 | json_file <- file.path( 23 | temp_dir, 24 | "BERT_checkpoints", 25 | "uncased_L-12_H-768_A-12", 26 | "bert_config.json" 27 | ) 28 | bert_config_from_json_file(json_file) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/check_vocab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{check_vocab} 4 | \alias{check_vocab} 5 | \title{Check Vocabulary} 6 | \usage{ 7 | check_vocab(words, ckpt_dir = NULL, vocab_file = find_vocab(ckpt_dir)) 8 | } 9 | \arguments{ 10 | \item{words}{Character vector; words to check.} 11 | 12 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 13 | other checkpoint files required by this function (\code{vocab_file}, 14 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 15 | standard filenames within \code{ckpt_dir}.} 16 | 17 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 18 | with one token per line, with the line number corresponding to the index of 19 | that token in the vocabulary.} 20 | } 21 | \value{ 22 | A logical vector containing \code{TRUE} if the corresponding word was 23 | found verbatim in the vocabulary, \code{FALSE} otherwise. 24 | } 25 | \description{ 26 | Given some words and a word piece vocabulary, checks to see if the words are 27 | in the vocabulary. 28 | } 29 | \examples{ 30 | \dontrun{ 31 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 32 | to_check <- c("apple", "appl") 33 | check_vocab(words = to_check, ckpt_dir = BERT_PRETRAINED_DIR) # TRUE, FALSE 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /man/clean_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{clean_text} 4 | \alias{clean_text} 5 | \title{Perform invalid character removal and whitespace cleanup on text.} 6 | \usage{ 7 | clean_text(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar.} 11 | } 12 | \value{ 13 | Cleaned up text. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._clean_text from 17 | BERT: tokenization.py.) 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/convert_by_vocab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{convert_by_vocab} 4 | \alias{convert_by_vocab} 5 | \alias{convert_tokens_to_ids} 6 | \alias{convert_ids_to_tokens} 7 | \title{Convert a sequence of tokens/ids using the provided vocab.} 8 | \usage{ 9 | convert_by_vocab(vocab, items) 10 | 11 | convert_tokens_to_ids(vocab, tokens) 12 | 13 | convert_ids_to_tokens(inv_vocab, ids) 14 | } 15 | \arguments{ 16 | \item{vocab}{Vocabulary; provides mapping from index to tokens.
(This may 17 | be in fact an "inverse vocabulary", where the names are the indices and 18 | the values are the tokens.)} 19 | 20 | \item{items}{Vector of the keys (names in the vocab vector) to "convert".} 21 | 22 | \item{tokens}{Equivalent to items.} 23 | 24 | \item{inv_vocab}{Equivalent to vocab.} 25 | 26 | \item{ids}{Equivalent to items.} 27 | } 28 | \value{ 29 | Vector of the values in `vocab` corresponding to `items`. 30 | (The names on the returned vector are kept.) 31 | } 32 | \description{ 33 | Convert a sequence of tokens/ids using the provided vocab. 34 | } 35 | \section{Functions}{ 36 | \itemize{ 37 | \item \code{convert_tokens_to_ids}: Wrapper function for specifically converting 38 | tokens to ids. 39 | 40 | \item \code{convert_ids_to_tokens}: Wrapper function for specifically converting 41 | ids to tokens. 42 | }} 43 | 44 | \examples{ 45 | convert_by_vocab(c("token1" = 0, "token2" = 1), "token1") 46 | } 47 | -------------------------------------------------------------------------------- /man/convert_examples_to_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{convert_examples_to_features} 4 | \alias{convert_examples_to_features} 5 | \title{Convert \code{InputExample}s to \code{InputFeatures}} 6 | \usage{ 7 | convert_examples_to_features(examples, label_list, max_seq_length, tokenizer) 8 | } 9 | \arguments{ 10 | \item{examples}{List of \code{InputExample}s to convert.} 11 | 12 | \item{label_list}{Character (or integer?); possible labels for examples.} 13 | 14 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 15 | considered together.} 16 | 17 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 18 | FullTokenizer).} 19 | } 20 | \value{ 21 | A list of \code{InputFeatures}. 22 | } 23 | \description{ 24 | Converts a set of \code{InputExample}s to a list of \code{InputFeatures}. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | tokenizer <- FullTokenizer("vocab.txt") 29 | input_ex1 <- InputExample( 30 | guid = 1L, 31 | text_a = "Some text to classify.", 32 | text_b = "More wordy words.", 33 | label = "good" 34 | ) 35 | input_ex2 <- InputExample( 36 | guid = 2L, 37 | text_a = "This is another example.", 38 | text_b = "So many words.", 39 | label = "bad" 40 | ) 41 | feat <- convert_examples_to_features( 42 | examples = list(input_ex1, input_ex2), 43 | label_list = c("good", "bad"), 44 | max_seq_length = 15L, 45 | tokenizer = tokenizer 46 | ) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /man/convert_single_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{convert_single_example} 4 | \alias{convert_single_example} 5 | \title{Convert a single \code{InputExample} into a single \code{InputFeatures}} 6 | \usage{ 7 | convert_single_example( 8 | ex_index, 9 | example, 10 | label_list, 11 | max_seq_length, 12 | tokenizer 13 | ) 14 | } 15 | \arguments{ 16 | \item{ex_index}{Integer; the index of this example. This is used to determine 17 | whether or not to print out some log info (for debugging or runtime 18 | confirmation). 
It is assumed this starts with 1 (in R).} 19 | 20 | \item{example}{The \code{InputExample} to convert.} 21 | 22 | \item{label_list}{Character (or integer); allowed labels for these examples.} 23 | 24 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 25 | considered together.} 26 | 27 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 28 | FullTokenizer).} 29 | } 30 | \value{ 31 | An object of class \code{InputFeatures}. 32 | } 33 | \description{ 34 | Converts a single \code{InputExample} into a single \code{InputFeatures}. 35 | } 36 | \examples{ 37 | \dontrun{ 38 | tokenizer <- FullTokenizer("vocab.txt") 39 | input_ex <- InputExample( 40 | guid = 1L, 41 | text_a = "Some text to classify.", 42 | text_b = "More wordy words.", 43 | label = "good" 44 | ) 45 | feat <- convert_single_example( 46 | ex_index = 1L, 47 | example = input_ex, 48 | label_list = c("good", "bad"), 49 | max_seq_length = 15L, 50 | tokenizer = tokenizer 51 | ) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /man/convert_to_unicode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{convert_to_unicode} 4 | \alias{convert_to_unicode} 5 | \title{Convert `text` to Unicode} 6 | \usage{ 7 | convert_to_unicode(text) 8 | } 9 | \arguments{ 10 | \item{text}{character scalar to convert to unicode} 11 | } 12 | \value{ 13 | input text, converted to unicode if applicable 14 | } 15 | \description{ 16 | See documentation for `Encoding` for more information. 17 | Assumes utf-8 input. 18 | } 19 | \examples{ 20 | convert_to_unicode("fa\xC3\xA7ile") 21 | } 22 | -------------------------------------------------------------------------------- /man/create_attention_mask_from_input_mask.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{create_attention_mask_from_input_mask} 4 | \alias{create_attention_mask_from_input_mask} 5 | \title{Create 3D attention mask from a 2D tensor mask} 6 | \usage{ 7 | create_attention_mask_from_input_mask(from_tensor, to_mask) 8 | } 9 | \arguments{ 10 | \item{from_tensor}{2D or 3D Tensor of shape [batch_size, from_seq_length, 11 | ...].} 12 | 13 | \item{to_mask}{int32 Tensor of shape [batch_size, to_seq_length].} 14 | } 15 | \value{ 16 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 17 | } 18 | \description{ 19 | An attention mask is used to zero out specific elements of an attention 20 | matrix. (For example, to prevent the model from "paying attention to the 21 | answer" in certain training tasks.) 
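For intuition, here is a plain-R sketch of the broadcasting this function performs (not the TensorFlow implementation): the \code{[batch_size, to_seq_length]} mask is simply repeated across every "from" position.

to_mask <- matrix(c(1L, 1L, 1L,
                    1L, 1L, 0L),
                  nrow = 2, byrow = TRUE)        # [batch_size = 2, to_seq_length = 3]
from_seq_length <- 4L
attention_mask <- array(0L, dim = c(2L, from_seq_length, 3L))
for (b in 1:2) {
  # every "from" position in example b sees the same (possibly padded) "to" mask
  attention_mask[b, , ] <- matrix(rep(to_mask[b, ], each = from_seq_length),
                                  nrow = from_seq_length)
}
attention_mask[2, , ]                            # each row is 1 1 0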
22 | } 23 | \examples{ 24 | \dontrun{ 25 | with(tensorflow::tf$variable_scope("examples", 26 | reuse = tensorflow::tf$AUTO_REUSE 27 | ), { 28 | from_tensor <- ids <- tensorflow::tf$get_variable("ften", 29 | dtype = "float", shape = c(10, 20) 30 | ) 31 | to_mask <- ids <- tensorflow::tf$get_variable("mask", 32 | dtype = "int32", shape = c(10, 30) 33 | ) 34 | }) 35 | create_attention_mask_from_input_mask(from_tensor, to_mask) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/create_initializer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{create_initializer} 4 | \alias{create_initializer} 5 | \title{Create truncated normal initializer} 6 | \usage{ 7 | create_initializer(initializer_range = 0.02) 8 | } 9 | \arguments{ 10 | \item{initializer_range}{A double describing the range for the initializer 11 | (passed to the stddev parameter).} 12 | } 13 | \value{ 14 | A tensorflow initializer. 15 | } 16 | \description{ 17 | This is a wrapper around the tensorflow truncated_normal_initializer 18 | function. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | create_initializer(0.02) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/create_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{create_model} 4 | \alias{create_model} 5 | \title{Create a classification model} 6 | \usage{ 7 | create_model( 8 | bert_config, 9 | is_training, 10 | input_ids, 11 | input_mask, 12 | segment_ids, 13 | labels, 14 | num_labels 15 | ) 16 | } 17 | \arguments{ 18 | \item{bert_config}{\code{BertConfig} instance.} 19 | 20 | \item{is_training}{Logical; TRUE for training model, FALSE for eval model. 21 | Controls whether dropout will be applied.} 22 | 23 | \item{input_ids}{Integer Tensor of shape \code{[batch_size, seq_length]}.} 24 | 25 | \item{input_mask}{Integer Tensor of shape \code{[batch_size, seq_length]}.} 26 | 27 | \item{segment_ids}{Integer Tensor of shape \code{[batch_size, seq_length]}.} 28 | 29 | \item{labels}{Integer Tensor; represents training example classification 30 | labels. Length = batch size.} 31 | 32 | \item{num_labels}{Integer; number of classification labels.} 33 | } 34 | \value{ 35 | A list including the loss (for training) and the model output 36 | (softmax probabilities, log probs). 37 | } 38 | \description{ 39 | Takes the output layer from a BERT "spine" and appends a classifier layer to 40 | it. The output taken from BERT is the pooled first token layers (may want to 41 | modify the code to use token-level outputs). The classifier is essentially a 42 | single dense layer with softmax. 
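As a plain-R sketch of that classifier head (not the TensorFlow graph this function actually builds; the vector length, weights, and label count below are invented for illustration):

softmax <- function(x) exp(x) / sum(exp(x))
pooled_output <- rnorm(768)                         # stand-in for the pooled first-token vector
W <- matrix(rnorm(768 * 2, sd = 0.02), nrow = 768)  # hypothetical weights for 2 labels
b <- c(0, 0)
logits <- as.numeric(pooled_output %*% W) + b
probs <- softmax(logits)                            # the "softmax probabilities" in the return value
log_probs <- log(probs)                             # and the "log probs"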
43 | } 44 | \examples{ 45 | \dontrun{ 46 | with(tensorflow::tf$variable_scope("examples", 47 | reuse = tensorflow::tf$AUTO_REUSE 48 | ), { 49 | input_ids <- tensorflow::tf$constant(list( 50 | list(31L, 51L, 99L), 51 | list(15L, 5L, 0L) 52 | )) 53 | 54 | input_mask <- tensorflow::tf$constant(list( 55 | list(1L, 1L, 1L), 56 | list(1L, 1L, 0L) 57 | )) 58 | token_type_ids <- tensorflow::tf$constant(list( 59 | list(0L, 0L, 1L), 60 | list(0L, 2L, 0L) 61 | )) 62 | config <- BertConfig( 63 | vocab_size = 32000L, 64 | hidden_size = 768L, 65 | num_hidden_layers = 8L, 66 | num_attention_heads = 12L, 67 | intermediate_size = 1024L 68 | ) 69 | class_model <- create_model( 70 | bert_config = config, 71 | is_training = TRUE, 72 | input_ids = input_ids, 73 | input_mask = input_mask, 74 | segment_ids = token_type_ids, 75 | labels = c(1L, 2L), 76 | num_labels = 2L 77 | ) 78 | }) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /man/create_optimizer.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/optimization.R 3 | \name{create_optimizer} 4 | \alias{create_optimizer} 5 | \title{Create an optimizer training op} 6 | \usage{ 7 | create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu) 8 | } 9 | \arguments{ 10 | \item{loss}{Float Tensor; the loss for this step (calculated elsewhere; in 11 | principle is a function of trainable parameter values).} 12 | 13 | \item{init_lr}{Numeric; initial learning rate.} 14 | 15 | \item{num_train_steps}{Integer; number of steps to train for.} 16 | 17 | \item{num_warmup_steps}{Integer; number of steps to use for "warm-up".} 18 | 19 | \item{use_tpu}{Logical; whether to use TPU.} 20 | } 21 | \value{ 22 | A training op: the result of a tensorflow group() of operations. 23 | } 24 | \description{ 25 | \code{create_optimizer} doesn't actually return the optimizer object; it 26 | returns the operation resulting from a tf.group() call. 27 | } 28 | \details{ 29 | See also: 30 | 31 | \url{https://www.tensorflow.org/api_docs/python/tf/group} 32 | 33 | \url{https://stackoverflow.com/questions/41780655/what-is-the-difference-between-tf-group-and-tf-control-dependencies} 34 | 35 | The routine tf.gradients() is called in the course of this function.
36 | \url{https://www.tensorflow.org/api_docs/python/tf/gradients} 37 | } 38 | \examples{ 39 | \dontrun{ 40 | with(tensorflow::tf$variable_scope("examples", 41 | reuse = tensorflow::tf$AUTO_REUSE 42 | ), { 43 | totrain <- tensorflow::tf$get_variable( 44 | "totrain", 45 | tensorflow::shape(10L, 20L) 46 | ) 47 | loss <- 2 * totrain 48 | 49 | train_op <- create_optimizer( 50 | loss = loss, 51 | init_lr = 0.01, 52 | num_train_steps = 20L, 53 | num_warmup_steps = 10L, 54 | use_tpu = FALSE 55 | ) 56 | }) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/dot-InputFeatures_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.InputFeatures_EF} 4 | \alias{.InputFeatures_EF} 5 | \title{Construct objects of class \code{InputFeatures_EF}} 6 | \usage{ 7 | .InputFeatures_EF(unique_id, tokens, input_ids, input_mask, input_type_ids) 8 | } 9 | \arguments{ 10 | \item{unique_id}{Integer or character; a unique id for this example.} 11 | 12 | \item{tokens}{Character vector; the actual tokens in this example.} 13 | 14 | \item{input_ids}{Integer vector; the sequence of token ids in this example.} 15 | 16 | \item{input_mask}{Integer vector; sequence of 1s (for "real" tokens) and 0s 17 | (for padding tokens).} 18 | 19 | \item{input_type_ids}{Integer vector; aka token_type_ids. Indicators for 20 | which sentence (or sequence) each token belongs to. Classical BERT supports 21 | only 0s and 1s (for first and second sentence, respectively).} 22 | } 23 | \value{ 24 | An object of class \code{InputFeatures_EF}. 25 | } 26 | \description{ 27 | An InputFeatures object is a single set of (input) features of data used for 28 | (output) feature extraction. Note that this class is similar to the 29 | InputFeatures class used for simple sequence classification, with annoying 30 | differences. Will eventually standardize; till then, check parameter names. 31 | (RBERT issue #28.) 32 | } 33 | \keyword{internal} 34 | -------------------------------------------------------------------------------- /man/dot-choose_BERT_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.choose_BERT_dir} 4 | \alias{.choose_BERT_dir} 5 | \title{Choose a directory for BERT checkpoints} 6 | \usage{ 7 | .choose_BERT_dir(dir) 8 | } 9 | \arguments{ 10 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 11 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 12 | determined from the \code{dir} parameter if supplied, followed by the 13 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 14 | folder in the user cache directory (determined using 15 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 16 | `RBERT.dir` option will be updated to that location. Note that the 17 | checkpoint will create a subdirectory inside this \code{dir}.} 18 | } 19 | \value{ 20 | A character vector indicating a directory in which BERT checkpoints 21 | are stored. 22 | } 23 | \description{ 24 | If \code{dir} is not NULL, this function simply returns \code{dir}. Otherwise 25 | it checks the `RBERT.dir` option, and then uses 26 | \code{\link[rappdirs]{user_cache_dir}} to choose a directory if necessary.
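That precedence can be pictured with a standalone sketch (this is not the function's source; the %||% helper is defined here only for illustration):

`%||%` <- function(x, y) if (is.null(x)) y else x
chosen_dir <- dir %||%
  getOption("RBERT.dir") %||%
  rappdirs::user_cache_dir(appname = "RBERT")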
27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/dot-convert_examples_to_features_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.convert_examples_to_features_EF} 4 | \alias{.convert_examples_to_features_EF} 5 | \title{Convert \code{InputExample_EF}s to \code{InputFeatures_EF}} 6 | \usage{ 7 | .convert_examples_to_features_EF(examples, seq_length, tokenizer) 8 | } 9 | \arguments{ 10 | \item{examples}{List of \code{InputExample_EF}s to convert.} 11 | 12 | \item{seq_length}{Integer; the maximum number of tokens that will be 13 | considered together.} 14 | 15 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 16 | FullTokenizer).} 17 | } 18 | \value{ 19 | A list of \code{InputFeatures}. 20 | } 21 | \description{ 22 | Converts a set of \code{InputExample_EF}s to a list of 23 | \code{InputFeatures_EF}. Very similar to \code{convert_examples_to_features} 24 | from run_classifier.R. (RBERT issue #28.) 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/dot-convert_single_example_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.convert_single_example_EF} 4 | \alias{.convert_single_example_EF} 5 | \title{Convert a single \code{InputExample_EF} into a single \code{InputFeatures_EF}} 6 | \usage{ 7 | .convert_single_example_EF(ex_index, example, seq_length, tokenizer) 8 | } 9 | \arguments{ 10 | \item{ex_index}{Integer; the index of this example. This is used to determine 11 | whether or not to print out some log info (for debugging or runtime 12 | confirmation). It is assumed this starts with 1 (in R).} 13 | 14 | \item{example}{The \code{InputExample_EF} to convert.} 15 | 16 | \item{seq_length}{Integer; the maximum number of tokens that will be 17 | considered together.} 18 | 19 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 20 | FullTokenizer).} 21 | } 22 | \value{ 23 | An object of class \code{InputFeatures_EF}. 24 | } 25 | \description{ 26 | Converts a single \code{InputExample_EF} into a single 27 | \code{InputFeatures_EF}. Very similar to \code{convert_single_example} from 28 | run_classifier.R. (RBERT issue #28.) 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /man/dot-download_BERT_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.download_BERT_checkpoint} 4 | \alias{.download_BERT_checkpoint} 5 | \title{Download a checkpoint zip file} 6 | \usage{ 7 | .download_BERT_checkpoint(url, checkpoint_zip_path) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | 13 | \item{checkpoint_zip_path}{The path to which the checkpoint zip should be 14 | downloaded.} 15 | } 16 | \value{ 17 | \code{TRUE} invisibly. 
18 | } 19 | \description{ 20 | Download a checkpoint zip file 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/dot-get_actual_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.get_actual_index} 4 | \alias{.get_actual_index} 5 | \title{Standardize Indices} 6 | \usage{ 7 | .get_actual_index(index, length) 8 | } 9 | \arguments{ 10 | \item{index}{Integer; the index to normalize.} 11 | 12 | \item{length}{Integer; the length of the vector or list we are indexing.} 13 | } 14 | \value{ 15 | The "actual" integer index, between 1 and \code{length}, inclusive. 16 | } 17 | \description{ 18 | Convert negative indices to positive ones. Use the convention that 19 | \code{vec[[-1L]]} signifies the last element of \code{vec}, \code{vec[[-2L]]} 20 | signifies the second-to-last element of \code{vec}, and so on. 1-based 21 | indexing is assumed. Values of zero, or out-of-range indices, will be 22 | rejected. 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/dot-get_model_archive_path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_archive_path} 4 | \alias{.get_model_archive_path} 5 | \title{Locate an archive file for a BERT checkpoint} 6 | \usage{ 7 | .get_model_archive_path(model, dir, archive_type) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | 12 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 13 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 14 | determined from the \code{dir} parameter if supplied, followed by the 15 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 16 | folder in the user cache directory (determined using 17 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 18 | `RBERT.dir` option will be updated to that location. Note that the 19 | checkpoint will create a subdirectory inside this \code{dir}.} 20 | 21 | \item{archive_type}{How is the checkpoint archived? We currently support 22 | "zip" and "tar-gzip". Leave NULL to infer from the \code{url}.} 23 | } 24 | \value{ 25 | The path to the archive file where the raw checkpoint should be 26 | saved. 27 | } 28 | \description{ 29 | Locate an archive file for a BERT checkpoint 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /man/dot-get_model_archive_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_archive_type} 4 | \alias{.get_model_archive_type} 5 | \title{Get archive type of a BERT checkpoint} 6 | \usage{ 7 | .get_model_archive_type(model) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | } 12 | \value{ 13 | The archive type to the specified BERT model. 14 | } 15 | \description{ 16 | Returns the archive type ("zip" or "tar-gzip") of the specified BERT 17 | checkpoint from the Google Research collection or other repository. 
18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/dot-get_model_subdir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_subdir} 4 | \alias{.get_model_subdir} 5 | \title{Locate a subdir for a BERT checkpoint} 6 | \usage{ 7 | .get_model_subdir(model, dir) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | 12 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 13 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 14 | determined from the \code{dir} parameter if supplied, followed by the 15 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 16 | folder in the user cache directory (determined using 17 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 18 | `RBERT.dir` option will be updated to that location. Note that the 19 | checkpoint will create a subdirectory inside this \code{dir}.} 20 | } 21 | \value{ 22 | The path to the sub-directory where the checkpoint should be saved. 23 | } 24 | \description{ 25 | Locate a subdir for a BERT checkpoint 26 | } 27 | \keyword{internal} 28 | -------------------------------------------------------------------------------- /man/dot-get_model_url.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.get_model_url} 4 | \alias{.get_model_url} 5 | \title{Get url of a BERT checkpoint} 6 | \usage{ 7 | .get_model_url(model) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | } 12 | \value{ 13 | The url to the specified BERT model. 14 | } 15 | \description{ 16 | Returns the url of the specified BERT checkpoint from the Google Research 17 | collection or other repository. 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/dot-has_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.has_checkpoint} 4 | \alias{.has_checkpoint} 5 | \title{Check whether the user already has a checkpoint} 6 | \usage{ 7 | .has_checkpoint(model = NULL, dir = NULL, ckpt_dir = NULL) 8 | } 9 | \arguments{ 10 | \item{model}{Character vector. Which model checkpoint to download.} 11 | 12 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 13 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 14 | determined from the \code{dir} parameter if supplied, followed by the 15 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 16 | folder in the user cache directory (determined using 17 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 18 | `RBERT.dir` option will be updated to that location. Note that the 19 | checkpoint will create a subdirectory inside this \code{dir}.} 20 | 21 | \item{ckpt_dir}{The path to the subdir where this checkpoint should 22 | be saved. 
If model is given, ckpt_dir is inferred.} 23 | } 24 | \value{ 25 | A logical indicating whether the user already has that checkpoint in 26 | that location. 27 | } 28 | \description{ 29 | Check the specified dir (or the default dir if none is specified) for a given 30 | model or url. 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/dot-infer_archive_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.infer_archive_type} 4 | \alias{.infer_archive_type} 5 | \title{Infer the archive type for a BERT checkpoint} 6 | \usage{ 7 | .infer_archive_type(url) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | } 13 | \value{ 14 | A character vector, currently either "zip" or "tar-gzip". 15 | } 16 | \description{ 17 | Infer the archive type for a BERT checkpoint 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/dot-infer_checkpoint_archive_path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.infer_checkpoint_archive_path} 4 | \alias{.infer_checkpoint_archive_path} 5 | \title{Infer the path to the archive for a BERT checkpoint} 6 | \usage{ 7 | .infer_checkpoint_archive_path(url, dir) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | 13 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 14 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 15 | determined from the \code{dir} parameter if supplied, followed by the 16 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 17 | folder in the user cache directory (determined using 18 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 19 | `RBERT.dir` option will be updated to that location. Note that the 20 | checkpoint will create a subdirectory inside this \code{dir}.} 21 | } 22 | \value{ 23 | A character vector file path, pointing to where the raw checkpoint 24 | archive should be saved. 25 | } 26 | \description{ 27 | Infer the path to the archive for a BERT checkpoint 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/dot-infer_ckpt_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.infer_ckpt_dir} 4 | \alias{.infer_ckpt_dir} 5 | \title{Infer the subdir for a BERT checkpoint} 6 | \usage{ 7 | .infer_ckpt_dir(url, dir) 8 | } 9 | \arguments{ 10 | \item{url}{Character vector. An optional url from which to download a 11 | checkpoint. Overrides \code{model} parameter if not NULL.} 12 | 13 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 14 | \code{NULL} to allow RBERT to automatically choose a directory. 
The path is 15 | determined from the \code{dir} parameter if supplied, followed by the 16 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 17 | folder in the user cache directory (determined using 18 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 19 | `RBERT.dir` option will be updated to that location. Note that the 20 | checkpoint will create a subdirectory inside this \code{dir}.} 21 | } 22 | \value{ 23 | A character vector file path, reflecting the "name" part of a 24 | checkpoint \code{url}, placed within \code{dir}. 25 | } 26 | \description{ 27 | Infer the subdir for a BERT checkpoint 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/dot-infer_model_paths.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{.infer_model_paths} 4 | \alias{.infer_model_paths} 5 | \title{Find Paths to Checkpoint Files} 6 | \usage{ 7 | .infer_model_paths( 8 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 9 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 10 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 11 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 12 | ckpt_dir = NULL, 13 | vocab_file = find_vocab(ckpt_dir), 14 | bert_config_file = find_config(ckpt_dir), 15 | init_checkpoint = find_ckpt(ckpt_dir) 16 | ) 17 | } 18 | \arguments{ 19 | \item{model}{Character; which model checkpoint to use. If specified, 20 | \code{ckpt_dir}, \code{vocab_file}, \code{bert_config_file}, and 21 | \code{init_checkpoint} will be inferred. If you do not have this 22 | checkpoint, you will be prompted to download it in interactive mode.} 23 | 24 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 25 | other checkpoint files required by this function (\code{vocab_file}, 26 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 27 | standard filenames within \code{ckpt_dir}.} 28 | 29 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 30 | with one token per line, with the line number corresponding to the index of 31 | that token in the vocabulary.} 32 | 33 | \item{bert_config_file}{Character; the path to a json config file.} 34 | 35 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 36 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 37 | explicit, starting with "/".} 38 | } 39 | \value{ 40 | A list with components vocab_file, bert_config_file, and 41 | init_checkpoint. 42 | } 43 | \description{ 44 | In some functions, the user can specify a model, a ckpt_dir, and/or specific 45 | paths to checkpoint files. This function sorts all of that out.
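For example, the public \code{extract_features} (documented later in this directory) accepts all three levels of specificity, and this helper reconciles them. A usage sketch (not run; the paths are illustrative only):

feats <- extract_features(examples, model = "bert_base_uncased")
feats <- extract_features(examples, ckpt_dir = "/path/to/uncased_L-12_H-768_A-12")
feats <- extract_features(
  examples,
  vocab_file = "/path/to/vocab.txt",
  bert_config_file = "/path/to/bert_config.json",
  init_checkpoint = "/path/to/bert_model.ckpt"
)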
46 | } 47 | \keyword{internal} 48 | -------------------------------------------------------------------------------- /man/dot-maybe_download_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.maybe_download_checkpoint} 4 | \alias{.maybe_download_checkpoint} 5 | \title{Find or Possibly Download a Checkpoint} 6 | \usage{ 7 | .maybe_download_checkpoint( 8 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 9 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 10 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 11 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 12 | dir = NULL, 13 | ckpt_dir = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{model}{Character vector. Which model checkpoint to download.} 18 | 19 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 20 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 21 | determined from the \code{dir} parameter if supplied, followed by the 22 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 23 | folder in the user cache directory (determined using 24 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 25 | `RBERT.dir` option will be updated to that location. Note that the 26 | checkpoint will create a subdirectory inside this \code{dir}.} 27 | 28 | \item{ckpt_dir}{The path to the subdir where this checkpoint should 29 | be saved. If model is given, ckpt_dir is inferred.} 30 | } 31 | \value{ 32 | TRUE (invisibly) 33 | } 34 | \description{ 35 | Verify that the user has a specified checkpoint, and prompt to download if 36 | they don't (in interactive mode). 37 | } 38 | \keyword{internal} 39 | -------------------------------------------------------------------------------- /man/dot-model_fn_builder_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{.model_fn_builder_EF} 4 | \alias{.model_fn_builder_EF} 5 | \title{Define \code{model_fn} closure for \code{TPUEstimator}} 6 | \usage{ 7 | .model_fn_builder_EF(bert_config, init_checkpoint, layer_indexes, use_tpu) 8 | } 9 | \arguments{ 10 | \item{bert_config}{\code{BertConfig} instance.} 11 | 12 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 13 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 14 | explicit, starting with "/".} 15 | 16 | \item{layer_indexes}{Integer list; indexes (positive, or negative counting 17 | back from the end) indicating which layers to extract as "output features". 18 | (It needs to be specified here because we get them back as the model 19 | "predictions".)} 20 | 21 | \item{use_tpu}{Logical; whether to use TPU.} 22 | } 23 | \value{ 24 | \code{model_fn} closure for \code{TPUEstimator}. 25 | } 26 | \description{ 27 | Returns \code{model_fn} closure, which is an input to \code{TPUEstimator}. 28 | This function is similar to \code{model_fn_builder} from run_classifier.R. 29 | (RBERT issue #28.) 
30 | } 31 | \details{ 32 | The \code{model_fn} function takes four parameters: \describe{ 33 | \item{features}{A list (or similar structure) that contains objects such as 34 | \code{input_ids}, \code{input_mask}, \code{tokens}, and 35 | \code{input_type_ids}. These objects will be inputs to the 36 | \code{create_model} function.} \item{labels}{Not used in this function, but 37 | presumably we need to keep this slot here.} \item{mode}{Character; value such 38 | as "train", "infer", or "eval".} \item{params}{Not used in this function, but 39 | presumably we need to keep this slot here.} } 40 | 41 | The output of \code{model_fn} is the result of a 42 | \code{tf$contrib$tpu$TPUEstimatorSpec} call. 43 | } 44 | \keyword{internal} 45 | -------------------------------------------------------------------------------- /man/dot-process_BERT_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{.process_BERT_checkpoint} 4 | \alias{.process_BERT_checkpoint} 5 | \title{Unzip and check a BERT checkpoint zip} 6 | \usage{ 7 | .process_BERT_checkpoint(dir, checkpoint_archive_path, ckpt_dir, archive_type) 8 | } 9 | \arguments{ 10 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 11 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 12 | determined from the \code{dir} parameter if supplied, followed by the 13 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 14 | folder in the user cache directory (determined using 15 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 16 | `RBERT.dir` option will be updated to that location. Note that the 17 | checkpoint will create a subdirectory inside this \code{dir}.} 18 | 19 | \item{archive_type}{How is the checkpoint archived? We currently support 20 | "zip" and "tar-gzip". Leave NULL to infer from the \code{url}.} 21 | } 22 | \value{ 23 | \code{TRUE} invisibly. 24 | } 25 | \description{ 26 | Unzip and check a BERT checkpoint zip 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/download_BERT_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{download_BERT_checkpoint} 4 | \alias{download_BERT_checkpoint} 5 | \title{Download a BERT checkpoint} 6 | \source{ 7 | \url{https://github.com/google-research/bert} 8 | 9 | \url{https://github.com/allenai/scibert} 10 | } 11 | \usage{ 12 | download_BERT_checkpoint( 13 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 14 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 15 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 16 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 17 | dir = NULL, 18 | url = NULL, 19 | force = FALSE, 20 | keep_archive = FALSE, 21 | archive_type = NULL 22 | ) 23 | } 24 | \arguments{ 25 | \item{model}{Character vector. Which model checkpoint to download.} 26 | 27 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 28 | \code{NULL} to allow RBERT to automatically choose a directory. 
The path is 29 | determined from the \code{dir} parameter if supplied, followed by the 30 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 31 | folder in the user cache directory (determined using 32 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 33 | `RBERT.dir` option will be updated to that location. Note that the 34 | checkpoint will create a subdirectory inside this \code{dir}.} 35 | 36 | \item{url}{Character vector. An optional url from which to download a 37 | checkpoint. Overrides \code{model} parameter if not NULL.} 38 | 39 | \item{force}{Logical. Download even if the checkpoint already exists in the 40 | specified directory? Default \code{FALSE}.} 41 | 42 | \item{keep_archive}{Logical. Keep the zip (or other archive) file? Leave as 43 | \code{FALSE} to save space.} 44 | 45 | \item{archive_type}{How is the checkpoint archived? We currently support 46 | "zip" and "tar-gzip". Leave NULL to infer from the \code{url}.} 47 | } 48 | \value{ 49 | If successful, returns the path to the downloaded checkpoint. 50 | } 51 | \description{ 52 | Downloads the specified BERT checkpoint from the Google Research collection 53 | or other repositories. 54 | } 55 | \section{Checkpoints}{ 56 | \code{download_BERT_checkpoint} knows about several 57 | pre-trained BERT checkpoints. You can specify these checkpoints using the 58 | \code{model} parameter. Alternatively, you can supply a direct \code{url} 59 | to any BERT tensorflow checkpoint. 60 | 61 | \tabular{rccccl}{ model \tab layers \tab hidden \tab heads \tab parameters 62 | \tab special\cr bert_base_* \tab 12 \tab 768 \tab 12 \tab 110M\cr 63 | bert_large_* \tab 24 \tab 1024 \tab 16 \tab 340M\cr bert_large_*_wwm \tab 64 | 24 \tab 1024 \tab 16 \tab 340M \tab whole word masking\cr 65 | bert_base_multilingual_cased \tab 12 \tab 768 \tab 12 \tab 110M \tab 104 66 | languages\cr bert_base_chinese \tab 12 \tab 768 \tab 12 \tab 110M \tab 67 | Chinese Simplified and Traditional\cr scibert_scivocab_* \tab 12 \tab 768 68 | \tab 12 \tab 110M \tab Trained using the full text of 1.14M scientific 69 | papers (18\% computer science, 82\% biomedical), with a science-specific 70 | vocabulary.\cr scibert_basevocab_uncased \tab 12 \tab 768 \tab 12 \tab 110M 71 | \tab As scibert_scivocab_*, but using the original BERT vocabulary. } 72 | } 73 | 74 | \examples{ 75 | \dontrun{ 76 | download_BERT_checkpoint("bert_base_uncased") 77 | download_BERT_checkpoint("bert_large_uncased") 78 | temp_dir <- tempdir() 79 | download_BERT_checkpoint("bert_base_uncased", dir = temp_dir) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /man/dropout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{dropout} 4 | \alias{dropout} 5 | \title{Perform Dropout} 6 | \usage{ 7 | dropout(input_tensor, dropout_prob = NULL) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Float Tensor to perform dropout on.} 11 | 12 | \item{dropout_prob}{A double giving the probability of dropping out a value 13 | (NOT of KEEPING a dimension as in `tf.nn.dropout`).} 14 | } 15 | \value{ 16 | A version of `input_tensor` with dropout applied. 
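As a reminder of the convention noted for \code{dropout_prob} above (a sketch, not the package source): the argument is the probability of dropping, i.e. the complement of the keep probability used by \code{tf$nn$dropout} in TensorFlow 1.x.

dropout_prob <- 0.1
keep_prob <- 1 - dropout_prob   # what TF 1.x tf$nn$dropout would expect instead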
17 | } 18 | \description{ 19 | Perform Dropout 20 | } 21 | \examples{ 22 | \dontrun{ 23 | tfx <- tensorflow::tf$get_variable("none", tensorflow::shape(10L)) 24 | dropout(tfx, 0.5) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/embedding_lookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{embedding_lookup} 4 | \alias{embedding_lookup} 5 | \title{Look up word embeddings for id tensor} 6 | \usage{ 7 | embedding_lookup( 8 | input_ids, 9 | vocab_size, 10 | embedding_size = 128L, 11 | initializer_range = 0.02, 12 | word_embedding_name = "word_embeddings" 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_ids}{Integer Tensor of shape [batch_size, seq_length] containing 17 | word ids.} 18 | 19 | \item{vocab_size}{Size of the embedding vocabulary (integer).} 20 | 21 | \item{embedding_size}{Width of the word embeddings (integer).} 22 | 23 | \item{initializer_range}{Embedding initialization range (float).} 24 | 25 | \item{word_embedding_name}{Name of the embedding table (character).} 26 | } 27 | \value{ 28 | Float Tensor of shape [batch_size, seq_length, embedding_size], along 29 | with the embedding table in a list. 30 | } 31 | \description{ 32 | Look up word embeddings for id tensor 33 | } 34 | \examples{ 35 | \dontrun{ 36 | with( 37 | tensorflow::tf$variable_scope("examples", 38 | reuse = tensorflow::tf$AUTO_REUSE 39 | ), 40 | ids <- tensorflow::tf$get_variable("x", 41 | dtype = "int32", 42 | shape = tensorflow::shape(10, 20) 43 | ) 44 | ) 45 | embedding_lookup(ids, vocab_size = 100, word_embedding_name = "some_name") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /man/embedding_postprocessor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{embedding_postprocessor} 4 | \alias{embedding_postprocessor} 5 | \title{Perform various post-processing on a word embedding tensor} 6 | \usage{ 7 | embedding_postprocessor( 8 | input_tensor, 9 | use_token_type = FALSE, 10 | token_type_ids = NULL, 11 | token_type_vocab_size = 16L, 12 | token_type_embedding_name = "token_type_embeddings", 13 | use_position_embeddings = TRUE, 14 | position_embedding_name = "position_embeddings", 15 | initializer_range = 0.02, 16 | max_position_embeddings = 512L, 17 | dropout_prob = 0.1 18 | ) 19 | } 20 | \arguments{ 21 | \item{input_tensor}{Float Tensor of shape \code{[batch_size, seq_length, 22 | embedding_size]}.} 23 | 24 | \item{use_token_type}{Logical; whether to add embeddings for 25 | \code{token_type_ids}.} 26 | 27 | \item{token_type_ids}{(optional) Integer Tensor of shape \code{[batch_size, 28 | seq_length]}. Must be specified if \code{use_token_type} is TRUE.} 29 | 30 | \item{token_type_vocab_size}{Integer; the vocabulary size of 31 | \code{token_type_ids}.
This defaults to 16 (here and in BERT code), but 32 | must be set to 2 for compatibility with saved checkpoints.} 33 | 34 | \item{token_type_embedding_name}{Character; the name of the embedding table 35 | variable for token type ids.} 36 | 37 | \item{use_position_embeddings}{Logical; whether to add position embeddings 38 | for the position of each token in the sequence.} 39 | 40 | \item{position_embedding_name}{Character; the name of the embedding table 41 | variable for positional embeddings.} 42 | 43 | \item{initializer_range}{Numeric; range of the weight initialization.} 44 | 45 | \item{max_position_embeddings}{Integer; maximum sequence length that might 46 | ever be used with this model. This can be longer than the sequence length 47 | of input_tensor, but cannot be shorter.} 48 | 49 | \item{dropout_prob}{Numeric; dropout probability applied to the final output 50 | tensor.} 51 | } 52 | \value{ 53 | Float Tensor with same shape as \code{input_tensor}. 54 | } 55 | \description{ 56 | This function (optionally) adds to the word embeddings additional embeddings 57 | for token type and position. 58 | } 59 | \details{ 60 | See figure 2 in the BERT paper: 61 | 62 | \url{https://arxiv.org/pdf/1810.04805.pdf} 63 | 64 | Both type and position embeddings are learned model variables. Note that 65 | token "type" is essentially a sentence identifier, indicating which sentence 66 | (or, more generally, piece of text) the token belongs to. 67 | } 68 | \examples{ 69 | \dontrun{ 70 | batch_size <- 10 71 | seq_length <- 512 72 | embedding_size <- 200 73 | with(tensorflow::tf$variable_scope("examples", 74 | reuse = tensorflow::tf$AUTO_REUSE 75 | ), { 76 | input_tensor <- tensorflow::tf$get_variable( 77 | "input", 78 | dtype = "float", 79 | shape = tensorflow::shape(batch_size, seq_length, embedding_size) 80 | ) 81 | token_type_ids <- tensorflow::tf$get_variable( 82 | "ids", 83 | dtype = "int32", 84 | shape = tensorflow::shape(batch_size, seq_length) 85 | ) 86 | }) 87 | embedding_postprocessor(input_tensor, 88 | use_token_type = TRUE, 89 | token_type_ids = token_type_ids 90 | ) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /man/extract_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{extract_features} 4 | \alias{extract_features} 5 | \title{Extract output features from BERT} 6 | \usage{ 7 | extract_features( 8 | examples, 9 | model = c("bert_base_uncased", "bert_base_cased", "bert_large_uncased", 10 | "bert_large_cased", "bert_large_uncased_wwm", "bert_large_cased_wwm", 11 | "bert_base_multilingual_cased", "bert_base_chinese", "scibert_scivocab_uncased", 12 | "scibert_scivocab_cased", "scibert_basevocab_uncased", "scibert_basevocab_cased"), 13 | ckpt_dir = NULL, 14 | vocab_file = find_vocab(ckpt_dir), 15 | bert_config_file = find_config(ckpt_dir), 16 | init_checkpoint = find_ckpt(ckpt_dir), 17 | output_file = NULL, 18 | max_seq_length = 128L, 19 | layer_indexes = -4:-1, 20 | batch_size = 2L, 21 | features = c("output", "attention"), 22 | verbose = FALSE 23 | ) 24 | } 25 | \arguments{ 26 | \item{examples}{List of \code{InputExample_EF} objects, or character 27 | vector(s) that can be converted to \code{InputExample_EF} objects.} 28 | 29 | \item{model}{Character; which model checkpoint to use. 
If specified, 30 | \code{ckpt_dir}, \code{vocab_file}, \code{bert_config_file}, and 31 | \code{init_checkpoint} will be inferred. If you do not have this 32 | checkpoint, you will be prompted to download it in interactive mode.} 33 | 34 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 35 | other checkpoint files required by this function (\code{vocab_file}, 36 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 37 | standard filenames within \code{ckpt_dir}.} 38 | 39 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 40 | with one token per line, with the line number corresponding to the index of 41 | that token in the vocabulary.} 42 | 43 | \item{bert_config_file}{Character; the path to a json config file.} 44 | 45 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 46 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 47 | explicit, starting with "/".} 48 | 49 | \item{output_file}{(optional) Character; file path (stub) for writing output 50 | to.} 51 | 52 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 53 | considered together.} 54 | 55 | \item{layer_indexes}{Integer vector; indexes (positive, or negative counting 56 | back from the end) indicating which layers to extract as "output features". 57 | The "zeroth" layer embeddings are the input embedding vectors to the first 58 | layer.} 59 | 60 | \item{batch_size}{Integer; how many examples to process per batch.} 61 | 62 | \item{features}{Character; whether to return "output" (layer outputs, the 63 | default), "attention" (attention probabilities), or both.} 64 | 65 | \item{verbose}{Logical; if FALSE, suppresses most of the TensorFlow chatter 66 | by temporarily setting the logging threshold to its highest level. If TRUE, 67 | keeps the current logging threshold, which defaults to "WARN". To change 68 | the logging threshold of the current session, run 69 | \code{tensorflow::tf$logging$set_verbosity(tensorflow::tf$logging$DEBUG)} 70 | (setting whatever verbosity level you want).} 71 | } 72 | \value{ 73 | A list with elements "output" (the layer outputs as a tibble) and/or 74 | "attention" (the attention weights as a tibble). 75 | } 76 | \description{ 77 | Given example sentences (as a list of \code{InputExample_EF}s), apply an 78 | existing BERT model and capture certain output layers. (These could 79 | potentially be used as features in downstream tasks.) 80 | } 81 | \examples{ 82 | \dontrun{ 83 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 84 | examples <- c("I saw the branch on the bank.", 85 | "I saw the branch of the bank.") 86 | 87 | # Just specify checkpoint directory. 88 | feats <- extract_features( 89 | examples = examples, 90 | ckpt_dir = BERT_PRETRAINED_DIR 91 | ) 92 | # Can also just specify the model, if you have it downloaded. 93 | # In interactive mode, you'll be prompted to download the model if you do not 94 | # have it.
95 | feats <- extract_features( 96 | examples = examples, 97 | model = "bert_base_uncased" 98 | ) 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /man/figures/rbert_hex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/man/figures/rbert_hex.png -------------------------------------------------------------------------------- /man/file_based_convert_examples_to_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{file_based_convert_examples_to_features} 4 | \alias{file_based_convert_examples_to_features} 5 | \title{Convert a set of \code{InputExample}s to a TFRecord file.} 6 | \usage{ 7 | file_based_convert_examples_to_features( 8 | examples, 9 | label_list, 10 | max_seq_length, 11 | tokenizer, 12 | output_file 13 | ) 14 | } 15 | \arguments{ 16 | \item{examples}{List of \code{InputExample}s to convert.} 17 | 18 | \item{label_list}{Character (or integer?); possible labels for examples.} 19 | 20 | \item{max_seq_length}{Integer; the maximum number of tokens that will be 21 | considered together.} 22 | 23 | \item{tokenizer}{A tokenizer object to use (e.g. object of class 24 | FullTokenizer).} 25 | 26 | \item{output_file}{Character; path to file to write to.} 27 | } 28 | \value{ 29 | return value 30 | } 31 | \description{ 32 | description 33 | } 34 | -------------------------------------------------------------------------------- /man/file_based_input_fn_builder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{file_based_input_fn_builder} 4 | \alias{file_based_input_fn_builder} 5 | \title{summary} 6 | \usage{ 7 | file_based_input_fn_builder(x) 8 | } 9 | \arguments{ 10 | \item{x}{This parameter will be described when this function is implemented.} 11 | } 12 | \value{ 13 | return value 14 | } 15 | \description{ 16 | description 17 | } 18 | -------------------------------------------------------------------------------- /man/find_files.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{find_files} 4 | \alias{find_files} 5 | \alias{find_vocab} 6 | \alias{find_config} 7 | \alias{find_ckpt} 8 | \title{Find Checkpoint Files} 9 | \usage{ 10 | find_vocab(ckpt_dir) 11 | 12 | find_config(ckpt_dir) 13 | 14 | find_ckpt(ckpt_dir) 15 | } 16 | \arguments{ 17 | \item{ckpt_dir}{Character; the path to the checkpoint directory. If this 18 | argument is NULL, the associated functions also return NULL.} 19 | } 20 | \description{ 21 | Given the path to a checkpoint directory, return the paths to certain files 22 | in that directory. 23 | } 24 | \section{Functions}{ 25 | \itemize{ 26 | \item \code{find_vocab}: Find the vocabulary file ('vocab.txt'). 27 | 28 | \item \code{find_config}: Find the config file ('bert_config.json'). 29 | 30 | \item \code{find_ckpt}: Find the checkpoint file stub (files begin with 31 | 'bert_model.ckpt'). 
32 | }} 33 | 34 | -------------------------------------------------------------------------------- /man/gelu.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{gelu} 4 | \alias{gelu} 5 | \title{Gaussian Error Linear Unit} 6 | \usage{ 7 | gelu(x) 8 | } 9 | \arguments{ 10 | \item{x}{Float Tensor to perform activation on.} 11 | } 12 | \value{ 13 | `x` with the GELU activation applied. 14 | } 15 | \description{ 16 | This is a smoother version of the RELU. Original paper: 17 | https://arxiv.org/abs/1606.08415 18 | } 19 | \examples{ 20 | \dontrun{ 21 | tfx <- tensorflow::tf$get_variable("none", tensorflow::shape(10L)) 22 | gelu(tfx) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/get_activation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{get_activation} 4 | \alias{get_activation} 5 | \title{Map a string to a Python function} 6 | \usage{ 7 | get_activation(activation_string) 8 | } 9 | \arguments{ 10 | \item{activation_string}{String name of the activation function.} 11 | } 12 | \value{ 13 | A function corresponding to the activation function. If 14 | \code{activation_string} is NA, empty, or "linear", this will return NA. If 15 | \code{activation_string} is not a string, it will return 16 | \code{activation_string}. 17 | } 18 | \description{ 19 | Example: "relu" => `tensorflow::tf$nn$relu`. 20 | } 21 | \examples{ 22 | \dontrun{ 23 | get_activation("gelu") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/get_assignment_map_from_checkpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{get_assignment_map_from_checkpoint} 4 | \alias{get_assignment_map_from_checkpoint} 5 | \title{Compute the intersection of the current variables and checkpoint variables} 6 | \usage{ 7 | get_assignment_map_from_checkpoint(tvars, init_checkpoint) 8 | } 9 | \arguments{ 10 | \item{tvars}{List of training variables in the current model.} 11 | 12 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 13 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 14 | explicit, starting with "/".} 15 | } 16 | \value{ 17 | List with two elements: the assignment map and the initialized 18 | variable names. The assignment map is a list of the "base" variable names 19 | that are in both the current computational graph and the checkpoint. The 20 | initialized variable names list contains both the base names and the base 21 | names + ":0". (This seems redundant to me. I assume it will make sense 22 | later. -JDB) 23 | } 24 | \description{ 25 | Returns the intersection (not the union, as python docs say -JDB) of the sets 26 | of variable names from the current graph and the checkpoint. 27 | } 28 | \details{ 29 | Note that a Tensorflow checkpoint is not the same as a saved model. 
A saved 30 | model contains a complete description of the computational graph and is 31 | sufficient to reconstruct the entire model, while a checkpoint contains just 32 | the parameter values (and variable names), and so requires a specification of 33 | the original model structure to reconstruct the computational graph. -JDB 34 | } 35 | \examples{ 36 | \dontrun{ 37 | # Just for illustration: create a "model" with a couple variables 38 | # that overlap some variable names in the BERT checkpoint. 39 | with(tensorflow::tf$variable_scope("bert", 40 | reuse = tensorflow::tf$AUTO_REUSE 41 | ), { 42 | test_ten1 <- tensorflow::tf$get_variable( 43 | "encoder/layer_9/output/dense/bias", 44 | shape = c(1L, 2L, 3L) 45 | ) 46 | test_ten2 <- tensorflow::tf$get_variable( 47 | "encoder/layer_9/output/dense/kernel", 48 | shape = c(1L, 2L, 3L) 49 | ) 50 | }) 51 | tvars <- tensorflow::tf$get_collection( 52 | tensorflow::tf$GraphKeys$GLOBAL_VARIABLES 53 | ) 54 | temp_dir <- tempdir() 55 | init_checkpoint <- file.path( 56 | temp_dir, 57 | "BERT_checkpoints", 58 | "uncased_L-12_H-768_A-12", 59 | "bert_model.ckpt" 60 | ) 61 | 62 | amap <- get_assignment_map_from_checkpoint(tvars, init_checkpoint) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /man/get_shape_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{get_shape_list} 4 | \alias{get_shape_list} 5 | \title{Return the shape of tensor} 6 | \usage{ 7 | get_shape_list(tensor, expected_rank = NULL, name = NULL) 8 | } 9 | \arguments{ 10 | \item{tensor}{A tf.Tensor object to find the shape of.} 11 | 12 | \item{expected_rank}{The expected rank of \code{tensor}, as an integer vector 13 | or list. If this is specified and the \code{tensor} has a rank not listed 14 | in \code{expected_rank}, an exception will be thrown.} 15 | 16 | \item{name}{Optional name of the tensor for the error message.} 17 | } 18 | \value{ 19 | A list of dimensions of the shape of tensor. All static dimensions 20 | will be returned as native integers, and dynamic dimensions will be 21 | returned as tf.Tensor scalars. (I'm not very comfortable with this 22 | behavior. It's not usually good practice to make the return type vary 23 | depending on the input.) 24 | } 25 | \description{ 26 | Returns a list of the shape of tensor, preferring static dimensions. (A 27 | static dimension is known at graph definition time, and a dynamic dimension 28 | is known only at graph execution time.) 
29 | https://stackoverflow.com/questions/37096225/ 30 | } 31 | \examples{ 32 | \dontrun{ 33 | with(tensorflow::tf$variable_scope("examples", 34 | reuse = tensorflow::tf$AUTO_REUSE 35 | ), { 36 | phx <- tensorflow::tf$placeholder(tensorflow::tf$int32, shape = c(4)) 37 | get_shape_list(phx) # static 38 | tfu <- tensorflow::tf$unique(phx) 39 | tfy <- tfu$y 40 | get_shape_list(tfy) # dynamic 41 | }) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /man/input_fn_builder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{input_fn_builder} 4 | \alias{input_fn_builder} 5 | \title{Create an \code{input_fn} closure to be passed to TPUEstimator} 6 | \usage{ 7 | input_fn_builder(features, seq_length, is_training, drop_remainder) 8 | } 9 | \arguments{ 10 | \item{features}{A list of features (objects of class \code{InputFeatures}).} 11 | 12 | \item{seq_length}{Integer; the maximum length (number of tokens) of each 13 | example. (Examples should already be padded to this length by this point.)} 14 | 15 | \item{is_training}{Logical; whether these are training examples.} 16 | 17 | \item{drop_remainder}{Logical; whether to drop the extra if the number of 18 | elements in the dataset is not an exact multiple of the batch size,} 19 | } 20 | \value{ 21 | An \code{input_fn} closure to be passed to TPUEstimator. 22 | } 23 | \description{ 24 | Creates an \code{input_fn} closure to be passed to TPUEstimator. The output 25 | of this closure is the (modified) output of 26 | \code{tensorflow::tf$data$Dataset$from_tensor_slices} (an object of class 27 | "tensorflow.python.data.ops.dataset_ops.BatchDataset"). 28 | } 29 | \examples{ 30 | \dontrun{ 31 | tokenizer <- FullTokenizer("vocab.txt") 32 | seq_len <- 15L 33 | input_ex1 <- InputExample( 34 | guid = 1L, 35 | text_a = "Some text to classify.", 36 | text_b = "More wordy words.", 37 | label = "good" 38 | ) 39 | input_ex2 <- InputExample( 40 | guid = 2L, 41 | text_a = "This is another example.", 42 | text_b = "So many words.", 43 | label = "bad" 44 | ) 45 | feat <- convert_examples_to_features( 46 | examples = list(input_ex1, input_ex2), 47 | label_list = c("good", "bad"), 48 | max_seq_length = seq_len, 49 | tokenizer = tokenizer 50 | ) 51 | input_fn <- input_fn_builder( 52 | features = feat, 53 | seq_length = seq_len, 54 | is_training = TRUE, 55 | drop_remainder = FALSE 56 | ) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /man/input_fn_builder_EF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{input_fn_builder_EF} 4 | \alias{input_fn_builder_EF} 5 | \title{Create an \code{input_fn} closure to be passed to TPUEstimator} 6 | \usage{ 7 | input_fn_builder_EF(features, seq_length) 8 | } 9 | \arguments{ 10 | \item{features}{A list of features (objects of class 11 | \code{InputFeatures_EF}).} 12 | 13 | \item{seq_length}{Integer; the maximum length (number of tokens) of each 14 | example. (Examples should already be padded to this length by this point.)} 15 | } 16 | \value{ 17 | An \code{input_fn} closure to be passed to TPUEstimator. 18 | } 19 | \description{ 20 | Creates an \code{input_fn} closure to be passed to TPUEstimator. 
The output 21 | of this closure is the (modified) output of 22 | \code{tensorflow::tf$data$Dataset$from_tensor_slices} (an object of class 23 | "tensorflow.python.data.ops.dataset_ops.BatchDataset"). This function is 24 | similar to \code{input_fn_builder} from run_classifier.R. (RBERT issue #28.) 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/is_chinese_char.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_chinese_char} 4 | \alias{is_chinese_char} 5 | \title{Check whether cp is the codepoint of a CJK character.} 6 | \usage{ 7 | is_chinese_char(cp) 8 | } 9 | \arguments{ 10 | \item{cp}{A unicode codepoint, as an integer.} 11 | } 12 | \value{ 13 | Logical TRUE if cp is codepoint of a CJK character. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._is_chinese_char from 17 | BERT: tokenization.py. From that file: 18 | This defines a "chinese character" as anything in the CJK Unicode block: 19 | https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 20 | } 21 | \details{ 22 | Note that the CJK Unicode block is NOT all Japanese and Korean characters, 23 | despite its name. The modern Korean Hangul alphabet is a different block, 24 | as is Japanese Hiragana and Katakana. Those alphabets are used to write 25 | space-separated words, so they are not treated specially and are handled 26 | like the alphabets of the other languages.) 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/is_control.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_control} 4 | \alias{is_control} 5 | \title{Check whether `char` is a control character.} 6 | \usage{ 7 | is_control(char) 8 | } 9 | \arguments{ 10 | \item{char}{A character scalar, comprising a single unicode character.} 11 | } 12 | \value{ 13 | TRUE if char is a control character. 14 | } 15 | \description{ 16 | (R implementation of _is_control from BERT: tokenization.py.) 17 | } 18 | \details{ 19 | "\\t", "\\n", and "\\r" are technically control characters but we treat them 20 | as whitespace since they are generally considered as such. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/is_punctuation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_punctuation} 4 | \alias{is_punctuation} 5 | \title{Check whether `char` is a punctuation character.} 6 | \usage{ 7 | is_punctuation(char) 8 | } 9 | \arguments{ 10 | \item{char}{A character scalar, comprising a single unicode character.} 11 | } 12 | \value{ 13 | TRUE if char is a punctuation character. 14 | } 15 | \description{ 16 | (R implementation of _is_punctuation from BERT: tokenization.py.) 17 | } 18 | \details{ 19 | We treat all non-letter/number ASCII as punctuation. 20 | Characters such as "^", "$", and "`" are not in the Unicode 21 | Punctuation class but we treat them as punctuation anyway, for 22 | consistency. 
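For example, \code{is_punctuation("$")} or \code{is_punctuation("^")} should return TRUE even though those characters are outside the Unicode Punctuation class, while \code{is_punctuation("a")} should return FALSE.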
23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/is_whitespace.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{is_whitespace} 4 | \alias{is_whitespace} 5 | \title{Check whether `char` is a whitespace character.} 6 | \usage{ 7 | is_whitespace(char) 8 | } 9 | \arguments{ 10 | \item{char}{A character scalar, comprising a single unicode character.} 11 | } 12 | \value{ 13 | TRUE if char is a whitespace character. 14 | } 15 | \description{ 16 | (R implementation of _is_whitespace from BERT: tokenization.py.) 17 | } 18 | \details{ 19 | "\\t", "\\n", and "\\r" are technically control characters but we treat them 20 | as whitespace since they are generally considered as such. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/layer_norm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{layer_norm} 4 | \alias{layer_norm} 5 | \title{Run layer normalization} 6 | \usage{ 7 | layer_norm(input_tensor, name = NULL) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Tensor to perform layor normalization on.} 11 | 12 | \item{name}{Optional variable_scope for layer_norm.} 13 | } 14 | \value{ 15 | A Tensor of the same shape and type as `input_tensor`, with 16 | normalization applied. 17 | } 18 | \description{ 19 | Run layer normalization on the last dimension of the tensor. 20 | } 21 | \details{ 22 | Wrapper around tensorflow layer_norm function. From tensorflow documentation: 23 | Adds a Layer Normalization layer. Based on the paper: 24 | \url{https://arxiv.org/abs/1607.06450}. 25 | 26 | Note: \code{begin_norm_axis}: The first normalization dimension: 27 | normalization will be performed along dimensions (begin_norm_axis : 28 | rank(inputs) ) 29 | 30 | \code{begin_params_axis}: The first parameter (beta, gamma) dimension: scale 31 | and centering parameters will have dimensions (begin_params_axis : 32 | rank(inputs) ) and will be broadcast with the normalized inputs accordingly. 33 | } 34 | \examples{ 35 | \dontrun{ 36 | tfx <- tensorflow::tf$get_variable("example", tensorflow::shape(10L)) 37 | layer_norm(tfx) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /man/layer_norm_and_dropout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{layer_norm_and_dropout} 4 | \alias{layer_norm_and_dropout} 5 | \title{Run layer normalization followed by dropout} 6 | \usage{ 7 | layer_norm_and_dropout(input_tensor, dropout_prob = NULL, name = NULL) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Float Tensor to perform layer_norm and dropout on.} 11 | 12 | \item{dropout_prob}{A double describing the probability of dropping out a 13 | value (NOT of KEEPING a dimension as in `tf.nn.dropout`).} 14 | 15 | \item{name}{Optional variable_scope for layer_norm.} 16 | } 17 | \value{ 18 | Tensor resulting from applying layer_norm and dropout to 19 | \code{input_tensor}. 
20 | } 21 | \description{ 22 | Run layer normalization followed by dropout 23 | } 24 | \examples{ 25 | \dontrun{ 26 | tfx <- tensorflow::tf$get_variable("example2", tensorflow::shape(10L)) 27 | layer_norm_and_dropout(tfx, dropout_prob = 0.5) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /man/load_vocab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{load_vocab} 4 | \alias{load_vocab} 5 | \title{Load a vocabulary file} 6 | \usage{ 7 | load_vocab(vocab_file) 8 | } 9 | \arguments{ 10 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text 11 | file, with one token per line, with the line number corresponding to the 12 | index of that token in the vocabulary.} 13 | } 14 | \value{ 15 | In the BERT Python code, the vocab is returned as an OrderedDict 16 | from the collections package. Here we return the vocab as a named integer 17 | vector. Names are tokens in vocabulary, values are integer indices. 18 | } 19 | \description{ 20 | Load a vocabulary file 21 | } 22 | \examples{ 23 | \dontrun{ 24 | vocab <- load_vocab(vocab_file = "vocab.txt") 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/make_examples_simple.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{make_examples_simple} 4 | \alias{make_examples_simple} 5 | \title{Easily make examples for BERT} 6 | \usage{ 7 | make_examples_simple(seq_list) 8 | } 9 | \arguments{ 10 | \item{seq_list}{Character vector or list; text to turn into examples.} 11 | } 12 | \value{ 13 | A list of \code{InputExample_EF} objects. 14 | } 15 | \description{ 16 | A simple wrapper function to turn a list of text (as a character 17 | vector or list) into a list of examples suitable for use with RBERT. If the 18 | input is a flat list or vector of characters, the examples will be 19 | single-segment, with NULL for the second segment. If the input contains 20 | length-2 sublists or vectors, those examples will be two-segment sequences, 21 | e.g. for doing sentence-pair classification. 22 | } 23 | \examples{ 24 | input_ex <- make_examples_simple(c( 25 | "Here are some words.", 26 | "Here are some more words." 27 | )) 28 | input_ex2 <- make_examples_simple(list( 29 | c( 30 | "First sequence, first segment.", 31 | "First sequence, second segment." 32 | ), 33 | c( 34 | "Second sequence, first segment.", 35 | "Second sequence, second segment." 
36 | ) 37 | )) 38 | } 39 | -------------------------------------------------------------------------------- /man/model_fn_builder.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{model_fn_builder} 4 | \alias{model_fn_builder} 5 | \title{Define \code{model_fn} closure for \code{TPUEstimator}} 6 | \usage{ 7 | model_fn_builder( 8 | bert_config, 9 | num_labels, 10 | init_checkpoint, 11 | learning_rate, 12 | num_train_steps, 13 | num_warmup_steps, 14 | use_tpu 15 | ) 16 | } 17 | \arguments{ 18 | \item{bert_config}{\code{BertConfig} instance.} 19 | 20 | \item{num_labels}{Integer; number of classification labels.} 21 | 22 | \item{init_checkpoint}{Character; path to the checkpoint directory, plus 23 | checkpoint name stub (e.g. "bert_model.ckpt"). Path must be absolute and 24 | explicit, starting with "/".} 25 | 26 | \item{learning_rate}{Numeric; the learning rate.} 27 | 28 | \item{num_train_steps}{Integer; number of steps to train for.} 29 | 30 | \item{num_warmup_steps}{Integer; number of steps to use for "warm-up".} 31 | 32 | \item{use_tpu}{Logical; whether to use TPU.} 33 | } 34 | \value{ 35 | \code{model_fn} closure for \code{TPUEstimator}. 36 | } 37 | \description{ 38 | Returns \code{model_fn} closure, which is an input to \code{TPUEstimator}. 39 | } 40 | \details{ 41 | The \code{model_fn} function takes four parameters: \describe{ 42 | \item{features}{A list (or similar structure) that contains objects such as 43 | \code{input_ids}, \code{input_mask}, \code{segment_ids}, and 44 | \code{label_ids}. These objects will be inputs to the \code{create_model} 45 | function.} 46 | \item{labels}{Not used in this function, but presumably we need to 47 | keep this slot here.} 48 | \item{mode}{Character; value such as "train", "infer", 49 | or "eval".} 50 | \item{params}{Not used in this function, but presumably we need 51 | to keep this slot here.} 52 | } 53 | 54 | The output of \code{model_fn} is the result of a 55 | \code{tf$contrib$tpu$TPUEstimatorSpec} call. 
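As a rough sketch (a hypothetical skeleton for orientation only, not the actual implementation in R/run_classifier.R), the returned closure has this shape:
\preformatted{
model_fn <- function(features, labels, mode, params) {
  # 'features' carries the objects described above
  input_ids   <- features$input_ids
  input_mask  <- features$input_mask
  segment_ids <- features$segment_ids
  label_ids   <- features$label_ids
  # ... build the network with create_model(), initialize weights from
  # init_checkpoint, and return a tf$contrib$tpu$TPUEstimatorSpec ...
}
}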
56 | 57 | This reference may be helpful: 58 | \url{https://tensorflow.rstudio.com/tfestimators/articles/creating_estimators.html} 59 | } 60 | \examples{ 61 | \dontrun{ 62 | with(tensorflow::tf$variable_scope("examples", 63 | reuse = tensorflow::tf$AUTO_REUSE 64 | ), { 65 | input_ids <- tensorflow::tf$constant(list( 66 | list(31L, 51L, 99L), 67 | list(15L, 5L, 0L) 68 | )) 69 | 70 | input_mask <- tensorflow::tf$constant(list( 71 | list(1L, 1L, 1L), 72 | list(1L, 1L, 0L) 73 | )) 74 | token_type_ids <- tensorflow::tf$constant(list( 75 | list(0L, 0L, 1L), 76 | list(0L, 2L, 0L) 77 | )) 78 | config <- BertConfig( 79 | vocab_size = 30522L, 80 | hidden_size = 768L, 81 | num_hidden_layers = 8L, 82 | type_vocab_size = 2L, 83 | num_attention_heads = 12L, 84 | intermediate_size = 3072L 85 | ) 86 | 87 | temp_dir <- tempdir() 88 | init_checkpoint <- file.path( 89 | temp_dir, 90 | "BERT_checkpoints", 91 | "uncased_L-12_H-768_A-12", 92 | "bert_model.ckpt" 93 | ) 94 | 95 | example_mod_fn <- model_fn_builder( 96 | bert_config = config, 97 | num_labels = 2L, 98 | init_checkpoint = init_checkpoint, 99 | learning_rate = 0.01, 100 | num_train_steps = 20L, 101 | num_warmup_steps = 10L, 102 | use_tpu = FALSE 103 | ) 104 | }) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /man/reshape_from_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{reshape_from_matrix} 4 | \alias{reshape_from_matrix} 5 | \title{Turn a matrix into a tensor} 6 | \usage{ 7 | reshape_from_matrix(output_tensor, orig_shape_list) 8 | } 9 | \arguments{ 10 | \item{output_tensor}{Tensor to reshape. What a lousy name for an input.} 11 | 12 | \item{orig_shape_list}{Shape to cast Tensor into.} 13 | } 14 | \value{ 15 | The Tensor reshaped to rank specified by orig_shape_list. 16 | } 17 | \description{ 18 | Reshapes a rank 2 tensor back to its original rank >= 2 tensor. The final 19 | dimension ('width') of the tensor is assumed to be preserved. If a different 20 | width is requested, function will complain. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | with( 25 | tensorflow::tf$variable_scope("examples", 26 | reuse = tensorflow::tf$AUTO_REUSE 27 | ), 28 | r2t <- tensorflow::tf$get_variable("r2t", 29 | dtype = "int32", 30 | shape = c(10, 20) 31 | ) 32 | ) 33 | reshape_from_matrix(r2t, orig_shape_list = c(5L, 2L, 20L)) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /man/reshape_to_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{reshape_to_matrix} 4 | \alias{reshape_to_matrix} 5 | \title{Turn a tensor into a matrix} 6 | \usage{ 7 | reshape_to_matrix(input_tensor) 8 | } 9 | \arguments{ 10 | \item{input_tensor}{Tensor to reshape.} 11 | } 12 | \value{ 13 | The Tensor reshaped to rank 2. 14 | } 15 | \description{ 16 | Reshapes a >= rank 2 tensor to a rank 2 tensor. The last dimension is 17 | preserved; the rest are flattened. 
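For example, a Tensor of shape \code{[batch_size, seq_length, width]} should come back with shape \code{[batch_size * seq_length, width]}.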
18 | } 19 | \examples{ 20 | \dontrun{ 21 | with( 22 | tensorflow::tf$variable_scope("examples", 23 | reuse = tensorflow::tf$AUTO_REUSE 24 | ), 25 | r3t <- tensorflow::tf$get_variable("r3t", 26 | dtype = "int32", 27 | shape = c(10, 20, 3) 28 | ) 29 | ) 30 | reshape_to_matrix(r3t) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/set_BERT_dir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download_checkpoint.R 3 | \name{set_BERT_dir} 4 | \alias{set_BERT_dir} 5 | \title{Set the directory for BERT checkpoints} 6 | \usage{ 7 | set_BERT_dir(dir) 8 | } 9 | \arguments{ 10 | \item{dir}{Character vector. Destination directory for checkpoints. Leave 11 | \code{NULL} to allow RBERT to automatically choose a directory. The path is 12 | determined from the \code{dir} parameter if supplied, followed by the 13 | `RBERT.dir` option (set using \link{set_BERT_dir}), followed by an "RBERT" 14 | folder in the user cache directory (determined using 15 | \code{\link[rappdirs]{user_cache_dir}}). If you provide a \code{dir}, the 16 | `RBERT.dir` option will be updated to that location. Note that the 17 | checkpoint will create a subdirectory inside this \code{dir}.} 18 | } 19 | \value{ 20 | A list with the previous value of `BERT.dir` (invisibly). 21 | } 22 | \description{ 23 | Set a given \code{dir} as the default BERT checkpoint directory for this 24 | session, and create it if it does not exist. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | set_BERT_dir("fake_dir") 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/split_on_punc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{split_on_punc} 4 | \alias{split_on_punc} 5 | \title{Split text on punctuation.} 6 | \usage{ 7 | split_on_punc(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar, encoded as utf-8.} 11 | } 12 | \value{ 13 | The input text as a character vector, split on punctuation 14 | characters. 15 | } 16 | \description{ 17 | (R implementation of BasicTokenizer._run_split_on_punc from 18 | BERT: tokenization.py.) 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/strip_accents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{strip_accents} 4 | \alias{strip_accents} 5 | \title{Strip accents from a piece of text.} 6 | \usage{ 7 | strip_accents(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar, encoded as utf-8.} 11 | } 12 | \value{ 13 | text with accents removed. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._run_strip_accents from 17 | BERT: tokenization.py.) 
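For example, "é" should be reduced to "e", so an input like "café" becomes "cafe".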
18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /man/tokenize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize} 4 | \alias{tokenize} 5 | \alias{tokenize.FullTokenizer} 6 | \alias{tokenize.BasicTokenizer} 7 | \alias{tokenize.WordpieceTokenizer} 8 | \title{Tokenizers for various objects.} 9 | \usage{ 10 | tokenize(tokenizer, text) 11 | 12 | \method{tokenize}{FullTokenizer}(tokenizer, text) 13 | 14 | \method{tokenize}{BasicTokenizer}(tokenizer, text) 15 | 16 | \method{tokenize}{WordpieceTokenizer}(tokenizer, text) 17 | } 18 | \arguments{ 19 | \item{tokenizer}{The Tokenizer object to refer to.} 20 | 21 | \item{text}{The text to tokenize. For tokenize.WordpieceTokenizer, the text 22 | should have already been passed through BasicTokenizer.} 23 | } 24 | \value{ 25 | A list of tokens. 26 | } 27 | \description{ 28 | This tokenizer performs some basic cleaning, then splits up text on 29 | whitespace and punctuation. 30 | } 31 | \section{Methods (by class)}{ 32 | \itemize{ 33 | \item \code{FullTokenizer}: Tokenizer method for objects of FullTokenizer class. 34 | 35 | \item \code{BasicTokenizer}: Tokenizer method for objects of BasicTokenizer class. 36 | 37 | \item \code{WordpieceTokenizer}: Tokenizer method for objects of WordpieceTokenizer 38 | class. This uses a greedy longest-match-first algorithm to perform 39 | tokenization using the given vocabulary. For example: input = "unaffable" 40 | output = list("un", "##aff", "##able") ... although, ironically, the BERT 41 | vocabulary actually gives output = list("una", "##ffa", "##ble") for this 42 | example, even though they use it as an example in their code. 43 | }} 44 | 45 | \examples{ 46 | \dontrun{ 47 | tokenizer <- FullTokenizer("vocab.txt", TRUE) 48 | tokenize(tokenizer, text = "a bunch of words") 49 | } 50 | \dontrun{ 51 | tokenizer <- BasicTokenizer(TRUE) 52 | tokenize(tokenizer, text = "a bunch of words") 53 | } 54 | \dontrun{ 55 | vocab <- load_vocab(vocab_file = "vocab.txt") 56 | tokenizer <- WordpieceTokenizer(vocab) 57 | tokenize(tokenizer, text = "a bunch of words") 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /man/tokenize_chinese_chars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize_chinese_chars} 4 | \alias{tokenize_chinese_chars} 5 | \title{Add whitespace around any CJK character.} 6 | \usage{ 7 | tokenize_chinese_chars(text) 8 | } 9 | \arguments{ 10 | \item{text}{A character scalar.} 11 | } 12 | \value{ 13 | Text with spaces around CJK characters. 14 | } 15 | \description{ 16 | (R implementation of BasicTokenizer._tokenize_chinese_chars from 17 | BERT: tokenization.py.) This may result in doubled-up spaces, 18 | but that's the behavior of the python code... 
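For example, in an input like "abc漢字def", each CJK character should come back surrounded by spaces (with a doubled space between the two adjacent CJK characters).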
19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/tokenize_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize_text} 4 | \alias{tokenize_text} 5 | \title{Tokenize Text with Word Pieces} 6 | \usage{ 7 | tokenize_text( 8 | text, 9 | ckpt_dir = NULL, 10 | vocab_file = find_vocab(ckpt_dir), 11 | include_special = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{text}{Character vector; text to tokenize.} 16 | 17 | \item{ckpt_dir}{Character; path to checkpoint directory. If specified, any 18 | other checkpoint files required by this function (\code{vocab_file}, 19 | \code{bert_config_file}, or \code{init_checkpoint}) will default to 20 | standard filenames within \code{ckpt_dir}.} 21 | 22 | \item{vocab_file}{path to vocabulary file. File is assumed to be a text file, 23 | with one token per line, with the line number corresponding to the index of 24 | that token in the vocabulary.} 25 | 26 | \item{include_special}{Logical; whether to add the special tokens "[CLS]" (at 27 | the beginning) and "[SEP]" (at the end) of the token list.} 28 | } 29 | \value{ 30 | A list of character vectors, giving the tokenization of the input 31 | text. 32 | } 33 | \description{ 34 | Given some text and a word piece vocabulary, tokenizes the text. This is 35 | primarily a tool for quickly checking the tokenization of a piece of text. 36 | } 37 | \examples{ 38 | \dontrun{ 39 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 40 | tokens <- tokenize_text( 41 | text = c("Who doesn't like tacos?", "Not me!"), 42 | ckpt_dir = BERT_PRETRAINED_DIR 43 | ) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /man/tokenize_word.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{tokenize_word} 4 | \alias{tokenize_word} 5 | \title{Tokenize a single "word" (no whitespace).} 6 | \usage{ 7 | tokenize_word(word, vocab, unk_token = "[UNK]", max_chars = 100) 8 | } 9 | \arguments{ 10 | \item{word}{Word to tokenize.} 11 | 12 | \item{vocab}{Character vector containing vocabulary words} 13 | 14 | \item{unk_token}{Token to represent unknown words.} 15 | 16 | \item{max_chars}{Maximum length of word recognized.} 17 | } 18 | \value{ 19 | Input word as a list of tokens. 20 | } 21 | \description{ 22 | In BERT: tokenization.py, 23 | this code is inside the tokenize method for WordpieceTokenizer objects. 24 | I've moved it into its own function for clarity. 25 | Punctuation should already have been removed from the word. 
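With the toy vocabulary used in the examples below, "unknown" should split into "un" followed by "##known", while "known" has no matching initial piece (the vocabulary only contains the word-internal piece "##known") and should come back as the unknown token "[UNK]".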
26 | } 27 | \examples{ 28 | tokenize_word("unknown", vocab = c("un" = 0, "##known" = 1)) 29 | tokenize_word("known", vocab = c("un" = 0, "##known" = 1)) 30 | } 31 | -------------------------------------------------------------------------------- /man/transformer_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{transformer_model} 4 | \alias{transformer_model} 5 | \title{Build multi-head, multi-layer Transformer} 6 | \usage{ 7 | transformer_model( 8 | input_tensor, 9 | attention_mask = NULL, 10 | hidden_size = 768L, 11 | num_hidden_layers = 12L, 12 | num_attention_heads = 12L, 13 | intermediate_size = 3072L, 14 | intermediate_act_fn = gelu, 15 | hidden_dropout_prob = 0.1, 16 | attention_probs_dropout_prob = 0.1, 17 | initializer_range = 0.02, 18 | do_return_all_layers = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{input_tensor}{Float Tensor of shape \code{[batch_size, seq_length, 23 | hidden_size]}.} 24 | 25 | \item{attention_mask}{(Optional) Integer Tensor of shape \code{batch_size, 26 | seq_length, seq_length}, with 1 for positions that can be attended to and 0 27 | in positions that should not be.} 28 | 29 | \item{hidden_size}{Integer; hidden size of the Transformer.} 30 | 31 | \item{num_hidden_layers}{Integer; number of layers (blocks) in the 32 | Transformer.} 33 | 34 | \item{num_attention_heads}{Integer; number of attention heads in the 35 | Transformer.} 36 | 37 | \item{intermediate_size}{Integer; the size of the "intermediate" (a.k.a., 38 | feed forward) layer.} 39 | 40 | \item{intermediate_act_fn}{The non-linear activation function to apply to the 41 | output of the intermediate/feed-forward layer. (Function, not character.)} 42 | 43 | \item{hidden_dropout_prob}{Numeric; the dropout probability for the hidden 44 | layers.} 45 | 46 | \item{attention_probs_dropout_prob}{Numeric; the dropout probability of the 47 | attention probabilities.} 48 | 49 | \item{initializer_range}{Numeric; the range of the initializer (stddev of 50 | truncated normal).} 51 | 52 | \item{do_return_all_layers}{Logical; whether to also return all layers or 53 | just the final layer. If this is TRUE, will also return attention 54 | probabilities.} 55 | } 56 | \value{ 57 | float Tensor of shape \code{[batch_size, seq_length, hidden_size]}, 58 | the final hidden layer of the Transformer. Or if `do_return_all_layers` is 59 | `TRUE`, a list of such Tensors (one for each hidden layer). 60 | } 61 | \description{ 62 | Multi-headed, multi-layer Transformer from "Attention is All You Need". This 63 | is almost an exact implementation of the original Transformer encoder. 
64 | } 65 | \details{ 66 | See the original paper: \url{https://arxiv.org/abs/1706.03762} 67 | 68 | Also see: 69 | \url{https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py} 70 | } 71 | \examples{ 72 | \dontrun{ 73 | batch_size <- 10 74 | seq_length <- 500 75 | hidden_size <- 120 76 | 77 | with(tensorflow::tf$variable_scope("examples", 78 | reuse = tensorflow::tf$AUTO_REUSE 79 | ), { 80 | input_tensor <- tensorflow::tf$get_variable("input", 81 | shape = c( 82 | batch_size, 83 | seq_length, 84 | hidden_size 85 | ) 86 | ) 87 | }) 88 | 89 | model_t <- transformer_model( 90 | input_tensor = input_tensor, 91 | hidden_size = hidden_size 92 | ) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /man/transpose_for_scores.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeling.R 3 | \name{transpose_for_scores} 4 | \alias{transpose_for_scores} 5 | \title{Reshape and transpose tensor} 6 | \usage{ 7 | transpose_for_scores( 8 | input_tensor, 9 | batch_size, 10 | num_attention_heads, 11 | seq_length, 12 | width 13 | ) 14 | } 15 | \arguments{ 16 | \item{input_tensor}{Tensor to reshape and transpose.} 17 | 18 | \item{batch_size}{Size of the first dimension of input_tensor.} 19 | 20 | \item{num_attention_heads}{Size of the third dimension of input_tensor. (Will 21 | be transposed to second dimension.)} 22 | 23 | \item{seq_length}{Size of the second dimension of input_tensor. (Will be 24 | transposed to third dimension.)} 25 | 26 | \item{width}{Size of fourth dimension of input_tensor.} 27 | } 28 | \value{ 29 | Tensor of shape: batch_size, num_attention_heads, seq_length, width. 30 | } 31 | \description{ 32 | In Python code, this is internal to attention_layer. Pulling it out into 33 | separate function here. 34 | } 35 | \keyword{internal} 36 | -------------------------------------------------------------------------------- /man/truncate_seq_pair.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_classifier.R 3 | \name{truncate_seq_pair} 4 | \alias{truncate_seq_pair} 5 | \title{Truncate a sequence pair to the maximum length.} 6 | \usage{ 7 | truncate_seq_pair(tokens_a, tokens_b, max_length) 8 | } 9 | \arguments{ 10 | \item{tokens_a}{Character; a vector of tokens in the first input sequence.} 11 | 12 | \item{tokens_b}{Character; a vector of tokens in the second input sequence.} 13 | 14 | \item{max_length}{Integer; the maximum total length of the two sequences.} 15 | } 16 | \value{ 17 | A list containing two character vectors: trunc_a and trunc_b. 18 | } 19 | \description{ 20 | Truncates a sequence pair to the maximum length. 21 | This is a simple heuristic which will always truncate the longer sequence one 22 | token at a time (or the first sequence in case of a tie -JDB). This makes 23 | more sense than truncating an equal percent of tokens from each, since if one 24 | sequence is very short then each token that's truncated likely contains more 25 | information than a longer sequence. 26 | } 27 | \details{ 28 | The python code truncated the sequences in place, using the pass-by-reference 29 | functionality of python. In R, we return the truncated sequences in a list. 
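For instance, with the tie-breaking rule described above (and assuming tokens are dropped from the end of the longer vector), the example below, which truncates \code{c("a", "b", "c", "d")} and \code{c("w", "x", "y", "z")} to \code{max_length = 5}, should give \code{trunc_a = c("a", "b")} and \code{trunc_b = c("w", "x", "y")}.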
30 | } 31 | \examples{ 32 | \dontrun{ 33 | tokens_a <- c("a", "b", "c", "d") 34 | tokens_b <- c("w", "x", "y", "z") 35 | truncate_seq_pair(tokens_a, tokens_b, 5) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/whitespace_tokenize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tokenization.R 3 | \name{whitespace_tokenize} 4 | \alias{whitespace_tokenize} 5 | \title{Run basic whitespace cleaning and splitting on a piece of text.} 6 | \usage{ 7 | whitespace_tokenize(text) 8 | } 9 | \arguments{ 10 | \item{text}{Character scalar to tokenize.} 11 | } 12 | \value{ 13 | Character vector of tokens. 14 | } 15 | \description{ 16 | Run basic whitespace cleaning and splitting on a piece of text. 17 | } 18 | \examples{ 19 | whitespace_tokenize(text = " some\ttext \n with whitespace ") 20 | } 21 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(RBERT) 3 | 4 | test_check("RBERT") 5 | -------------------------------------------------------------------------------- /tests/testthat/attention_probs.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/attention_probs.rds -------------------------------------------------------------------------------- /tests/testthat/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /tests/testthat/sample_amap.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/sample_amap.rds -------------------------------------------------------------------------------- /tests/testthat/sample_feat_in.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/sample_feat_in.rds -------------------------------------------------------------------------------- /tests/testthat/sample_feats.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/sample_feats.rds -------------------------------------------------------------------------------- /tests/testthat/setup.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | checkpoint_main_dir <- tempdir() 16 | 17 | # We need the checkpoint to be available for the other tests, so "download" it 18 | # here. We use a mock function for the part that does the actual downloading, 19 | # and instead copy from tests/testthat/test_checkpoints. 20 | 21 | # First we need to check if the user has bert_base_uncased.zip. If they don't, 22 | # they still have to download that one. 23 | 24 | print("Setting up test checkpoint.") 25 | if (!file.exists("test_checkpoints/bert_base_uncased.zip")) { 26 | destfile <- normalizePath( 27 | "test_checkpoints/bert_base_uncased.zip", 28 | mustWork = FALSE 29 | ) 30 | 31 | status <- utils::download.file( 32 | url = "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip", 33 | destfile = destfile, 34 | method = "libcurl" 35 | ) 36 | } 37 | 38 | dont_download_checkpoint <- function(url, checkpoint_zip_path) { 39 | root_dir <- "test_checkpoints" 40 | 41 | from_file <- switch( 42 | url, 43 | "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip" = "bert_base_uncased.zip", 44 | "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/scibert_scivocab_uncased.tar.gz" = "testing_checkpoint.tar.gz" 45 | ) 46 | 47 | from_path <- file.path(root_dir, from_file) 48 | 49 | file.copy( 50 | from = from_path, 51 | to = checkpoint_zip_path, 52 | overwrite = TRUE 53 | ) 54 | 55 | invisible(TRUE) 56 | } 57 | 58 | mockery::stub( 59 | where = download_BERT_checkpoint, 60 | what = ".download_BERT_checkpoint", 61 | how = dont_download_checkpoint 62 | ) 63 | 64 | cpdir <- download_BERT_checkpoint( 65 | model = "bert_base_uncased", 66 | dir = checkpoint_main_dir 67 | ) 68 | -------------------------------------------------------------------------------- /tests/testthat/teardown.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | unlink(cpdir, recursive = TRUE) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-download_checkpoint.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | test_that("download_checkpoint works", { 17 | # checkpoint was downloaded in setup.R 18 | # Redownloading the checkpoint should occur without incident. 19 | new_cpdir <- download_BERT_checkpoint( 20 | model = "bert_base_uncased", 21 | dir = checkpoint_main_dir 22 | ) 23 | expect_identical(new_cpdir, cpdir) 24 | 25 | testthat::expect_true( 26 | file.exists(file.path(cpdir, "vocab.txt")) 27 | ) 28 | testthat::expect_true( 29 | file.exists(file.path(cpdir, "bert_config.json")) 30 | ) 31 | testthat::expect_true( 32 | file.exists(file.path(cpdir, "bert_model.ckpt.index")) 33 | ) 34 | testthat::expect_true( 35 | file.exists(file.path(cpdir, "bert_model.ckpt.meta")) 36 | ) 37 | testthat::expect_true( 38 | file.exists(file.path(cpdir, "bert_model.ckpt.data-00000-of-00001")) 39 | ) 40 | }) 41 | 42 | test_that("dir chooser works.", { 43 | expect_identical( 44 | .choose_BERT_dir("fake"), "fake" 45 | ) 46 | temp_dir <- tempdir() 47 | testing_dir <- paste0(temp_dir, "/testing") 48 | old_dir <- set_BERT_dir(testing_dir) 49 | expect_identical( 50 | normalizePath(getOption("BERT.dir"), mustWork = FALSE), 51 | normalizePath(testing_dir, mustWork = FALSE) 52 | ) 53 | 54 | # If I don't send it a dir, first it should try the option. 55 | expect_identical( 56 | .choose_BERT_dir(NULL), 57 | normalizePath(testing_dir, mustWork = FALSE) 58 | ) 59 | 60 | # If I don't have an option or a dir, it should use the default. 61 | options(BERT.dir = NULL) 62 | default_dir <- rappdirs::user_cache_dir("RBERT") 63 | expect_identical( 64 | .choose_BERT_dir(NULL), 65 | default_dir 66 | ) 67 | 68 | # Go back to the existing setting. 69 | options(BERT.dir = old_dir$BERT.dir) 70 | 71 | # Get rid of the empty dir. 72 | unlink(normalizePath(testing_dir), recursive = TRUE) 73 | }) 74 | 75 | test_that("Can download a cp by url.", { 76 | # The auto-generated target dir will be different from the one we saved in, so 77 | # move the one we downloaded, attempt to DL without forcing, then move it 78 | # back. That should make sure everything is working as expected. 79 | target_dir <- file.path( 80 | checkpoint_main_dir, "uncased_L-12_H-768_A-12" 81 | ) 82 | 83 | # Explicitly create the dir so file.copy can copy recursively. 84 | dir.create(target_dir) 85 | 86 | file.copy( 87 | cpdir, 88 | target_dir, 89 | recursive = TRUE 90 | ) 91 | 92 | google_base_url <- "https://storage.googleapis.com/bert_models/" 93 | bert_base_uncased_url <- paste0( 94 | google_base_url, 95 | "2018_10_18/uncased_L-12_H-768_A-12.zip" 96 | ) 97 | 98 | expect_warning( 99 | expect_identical( 100 | download_BERT_checkpoint( 101 | url = bert_base_uncased_url 102 | ), 103 | normalizePath(target_dir) 104 | ), 105 | NA 106 | ) 107 | 108 | unlink(target_dir, recursive = TRUE) 109 | 110 | # We also need to test one that has tar-gz. 
111 | scibert_url <- paste0( 112 | "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/", 113 | "tensorflow_models/scibert_scivocab_uncased.tar.gz" 114 | ) 115 | expect_warning( 116 | scibert_path <- download_BERT_checkpoint( 117 | url = scibert_url, dir = checkpoint_main_dir 118 | ), 119 | NA 120 | ) 121 | 122 | unlink(scibert_path, recursive = TRUE) 123 | }) 124 | 125 | test_that(".has_checkpoint works as expected.", { 126 | # We don't use this in the "easy" mode anymore, but I want to keep the extra 127 | # option around (inferring the subdir) until I'm *sure* we don't need it. 128 | expect_error( 129 | expect_true(.has_checkpoint(model = "bert_base_uncased")), 130 | NA 131 | ) 132 | }) 133 | -------------------------------------------------------------------------------- /tests/testthat/test-extract_features.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | test_that("features and examples routines work", { 16 | examples <- list( 17 | InputExample_EF( 18 | unique_id = 1, 19 | text_a = "I saw the branch on the bank.", 20 | text_b = "A second sequence of words." 21 | ), 22 | InputExample_EF( 23 | unique_id = 2, 24 | text_a = "I saw the branch of the bank." 25 | ) 26 | ) 27 | # tokenizer <- FullTokenizer("vocab.txt") 28 | # saveRDS(tokenizer, here::here("tests", "testthat", "tokenizer.rds")) 29 | # tokenizer <- readRDS(here::here("tests", "testthat", "tokenizer.rds")) 30 | tokenizer <- readRDS("tokenizer.rds") 31 | feat_in <- .convert_single_example_EF( 32 | ex_index = 6L, 33 | example = examples[[2]], 34 | seq_length = 5L, 35 | tokenizer = tokenizer 36 | ) 37 | expected_feat_in <- readRDS("sample_feat_in.rds") 38 | testthat::expect_identical(feat_in, expected_feat_in) 39 | 40 | 41 | # Run these tests only if checkpoint is found. 42 | BERT_PRETRAINED_DIR <- cpdir # from setup.R 43 | 44 | # Test the ckpt_dir argument here. (Expect no error.) 45 | feats <- extract_features( 46 | examples = examples, 47 | ckpt_dir = BERT_PRETRAINED_DIR, 48 | batch_size = 2L 49 | ) 50 | 51 | testthat::expect_error( 52 | extract_features( 53 | examples = examples, 54 | batch_size = 2L 55 | ), 56 | "ckpt_dir" 57 | ) 58 | 59 | # We should get the same thing if we specify by model instead. 60 | feats2 <- extract_features( 61 | examples = examples, 62 | model = "bert_base_uncased", 63 | batch_size = 2L 64 | ) 65 | 66 | expect_identical(feats2, feats) 67 | rm(feats2) 68 | 69 | # Also make sure it fails if they don't have the model. 
70 | expect_error( 71 | extract_features( 72 | examples = examples, 73 | model = "bert_base_cased" 74 | ), 75 | "Specify ckpt_dir" 76 | ) 77 | 78 | vocab_file <- file.path(BERT_PRETRAINED_DIR, "vocab.txt") 79 | init_checkpoint <- file.path(BERT_PRETRAINED_DIR, "bert_model.ckpt") 80 | # Checkpoint "path" is actually only a stub filename; add ".index" to 81 | # check for a specific file. 82 | testthat::skip_if_not(file.exists(paste0( 83 | init_checkpoint, 84 | ".index" 85 | )), 86 | message = "Checkpoint index not found; skipping test." 87 | ) 88 | 89 | bert_config_file <- file.path(BERT_PRETRAINED_DIR, "bert_config.json") 90 | 91 | # Each token should be repeated 4 times (once for each of the 4 layers 92 | # requested by default). I'm sure there's a better way to do this, but this 93 | # works for these sentences. 94 | tokens <- unlist( 95 | c( 96 | "[CLS]", 97 | tolower(stringr::str_extract_all( 98 | examples[[1]]$text_a, 99 | "\\b[^\\s]+\\b", 100 | simplify = TRUE 101 | )), 102 | ".", 103 | "[SEP]", 104 | tolower(stringr::str_extract_all( 105 | examples[[1]]$text_b, 106 | "\\b[^\\s]+\\b", 107 | simplify = TRUE 108 | )), 109 | ".", 110 | "[SEP]", 111 | "[CLS]", 112 | tolower(stringr::str_extract_all( 113 | examples[[1]]$text_a, 114 | "\\b[^\\s]+\\b", 115 | simplify = TRUE 116 | )), 117 | ".", 118 | "[SEP]" 119 | ) 120 | ) 121 | 122 | expect_equal( 123 | sort(unique(feats$output$token)), 124 | sort(unique(tokens)) 125 | ) 126 | 127 | # By default we fetch the last 4 layers. 128 | expect_equal(nrow(feats$output), length(tokens) * 4) 129 | expect_equal( 130 | ncol(feats$output), 131 | 5L + 768L 132 | ) 133 | expect_length(feats, 1) 134 | 135 | # Make sure we can grab layer 0 when we want to. 136 | feats <- extract_features( 137 | examples = examples, 138 | vocab_file = vocab_file, 139 | bert_config_file = bert_config_file, 140 | init_checkpoint = init_checkpoint, 141 | batch_size = 2L, 142 | layer_indexes = -4:0, 143 | features = c("output", "attention") 144 | ) 145 | 146 | expect_length(feats, 2) 147 | 148 | # There may be some minor numerical differences across different systems. Need 149 | # to do a comparison along the lines of dplyr::near. Needed to update these 150 | # tests for the new format, because some of the layer/token index repeats went 151 | # away, and thus the sum changed. I tibbled the expected feats and resaved. 152 | test_feats_flat <- suppressWarnings(as.numeric(unlist(feats$output))) 153 | 154 | # expected_feats <- readRDS( 155 | # here::here("tests", "testthat", "sample_feats.rds") 156 | # ) 157 | expected_feats <- readRDS("sample_feats.rds") 158 | # The sorting changed since I saved an example, so let's put it into the same 159 | # order as the one we're getting now. 
160 | expected_feats <- dplyr::arrange( 161 | expected_feats, 162 | sequence_index, 163 | layer_index, 164 | token_index 165 | ) 166 | expected_feats_flat <- suppressWarnings(as.numeric(unlist(expected_feats))) 167 | 168 | tol <- 10^(-5) 169 | 170 | # check both the sum and mean relative difference 171 | rel_diff_sum <- abs(sum(test_feats_flat, na.rm = TRUE) - 172 | sum(expected_feats_flat, na.rm = TRUE)) / 173 | (tol + abs(sum(test_feats_flat, na.rm = TRUE) + 174 | sum(expected_feats_flat, na.rm = TRUE))) 175 | testthat::expect_lte(rel_diff_sum, tol) 176 | 177 | mean_relative_difference <- mean(abs(test_feats_flat - expected_feats_flat) / 178 | (tol + abs(test_feats_flat + 179 | expected_feats_flat)), 180 | na.rm = TRUE 181 | ) 182 | 183 | testthat::expect_lte(mean_relative_difference, tol) 184 | 185 | test_attn_flat <- suppressWarnings( 186 | as.numeric(unlist(feats$attention$attention_weight)) 187 | ) 188 | 189 | # expected_attn <- readRDS( 190 | # here::here("tests", "testthat", "attention_probs.rds") 191 | # ) 192 | expected_attn <- readRDS("attention_probs.rds") 193 | expected_attn_flat <- suppressWarnings(as.numeric(unlist(expected_attn))) 194 | expected_attn_flat <- expected_attn_flat[!is.na(expected_attn_flat)] 195 | 196 | # The original expected value has rotated matrices relative to the tidy 197 | # tibble. However, it's unlikely that they'd work out to have sum and mean 198 | # below within tolerance if they were actually different, so I'm sorting to 199 | # get a "good enough" evaluation. 200 | expected_attn_flat <- sort(expected_attn_flat) 201 | test_attn_flat <- sort(test_attn_flat) 202 | 203 | rel_diff_sum <- abs(sum(test_attn_flat, na.rm = TRUE) - 204 | sum(expected_attn_flat, na.rm = TRUE)) / 205 | (tol + abs(sum(test_attn_flat, na.rm = TRUE) + 206 | sum(expected_attn_flat, na.rm = TRUE))) 207 | testthat::expect_lte(rel_diff_sum, tol) 208 | 209 | mean_relative_difference <- mean(abs(test_attn_flat - expected_attn_flat) / 210 | (tol + abs(test_attn_flat + 211 | expected_attn_flat)), 212 | na.rm = TRUE 213 | ) 214 | 215 | testthat::expect_lte(mean_relative_difference, tol) 216 | 217 | feats <- extract_features( 218 | examples = examples, 219 | vocab_file = vocab_file, 220 | bert_config_file = bert_config_file, 221 | init_checkpoint = init_checkpoint, 222 | batch_size = 2L, 223 | features = "output" 224 | ) 225 | expect_length(feats, 1) 226 | expect_is(feats$output, "tbl_df") 227 | expect_equal( 228 | colnames(feats$output), 229 | c( 230 | "sequence_index", "segment_index", "token_index", "token", "layer_index", 231 | paste0("V", 1:768) 232 | ) 233 | ) 234 | 235 | feats <- extract_features( 236 | examples = examples, 237 | vocab_file = vocab_file, 238 | bert_config_file = bert_config_file, 239 | init_checkpoint = init_checkpoint, 240 | batch_size = 2L, 241 | features = "attention" 242 | ) 243 | expect_length(feats, 1) 244 | expect_is(feats$attention, "tbl_df") 245 | expect_equal( 246 | colnames(feats$attention), 247 | c( 248 | "sequence_index", "token_index", "segment_index", "token", 249 | "layer_index", "head_index", "attention_token_index", 250 | "attention_segment_index", "attention_token", "attention_weight" 251 | ) 252 | ) 253 | 254 | # works for examples given as character vectors 255 | text_example1 <- "one" 256 | text_example2 <- list(c("one", "two"), c("three", "four")) 257 | text_example3 <- list(list("one", "two"), list("three", "four")) 258 | lone_example <- make_examples_simple(text_example1)[[1]] 259 | 260 | feats1 <- extract_features( 261 | examples = 
text_example1, 262 | model= "bert_base_uncased" 263 | ) 264 | testthat::expect_equal(dim(feats1$output), c(12L, 773L)) 265 | 266 | feats1b <- extract_features( 267 | examples = lone_example, 268 | model= "bert_base_uncased" 269 | ) 270 | testthat::expect_identical(feats1, feats1b) 271 | 272 | feats2 <- extract_features( 273 | examples = text_example2, 274 | model= "bert_base_uncased" 275 | ) 276 | testthat::expect_equal(dim(feats2$output), c(40L, 773L)) 277 | 278 | feats3 <- extract_features( 279 | examples = text_example3, 280 | model= "bert_base_uncased" 281 | ) 282 | testthat::expect_identical(feats2, feats3) 283 | 284 | # Manual speed tests: 285 | # emma_lines <- janeaustenr::emma[janeaustenr::emma != ""][5:54] 286 | # examples <- purrr::imap( 287 | # emma_lines, 288 | # ~ InputExample_EF(unique_id = .y, text_a = .x) 289 | # ) 290 | # microbenchmark::microbenchmark( 291 | # feats <- extract_features( 292 | # examples = examples, 293 | # vocab_file = vocab_file, 294 | # bert_config_file = bert_config_file, 295 | # init_checkpoint = init_checkpoint, 296 | # batch_size = 2L, 297 | # features = "attention" 298 | # ), 299 | # times = 1 300 | # ) 301 | }) 302 | 303 | test_that(".get_actual_index works", { 304 | testthat::expect_error( 305 | .get_actual_index(index = 0, length = 10), 306 | "Ambiguous" 307 | ) 308 | 309 | testthat::expect_error( 310 | .get_actual_index(index = 11, length = 10), 311 | "out of range" 312 | ) 313 | 314 | testthat::expect_identical(.get_actual_index(index = -2, length = 10), 9L) 315 | 316 | testthat::expect_identical(.get_actual_index(index = 9, length = 10), 9L) 317 | }) 318 | 319 | test_that("make_examples_simple works", { 320 | text <- c( 321 | "Here are some words.", 322 | "Here are some more words." 323 | ) 324 | input_ex <- make_examples_simple(text) 325 | testthat::expect_s3_class(input_ex[[1]], "InputExample_EF") 326 | 327 | testthat::expect_identical(input_ex[[1]]$text_a, text[[1]]) 328 | testthat::expect_null(input_ex[[1]]$text_b) 329 | testthat::expect_identical(input_ex[[2]]$text_a, text[[2]]) 330 | testthat::expect_null(input_ex[[2]]$text_b) 331 | }) 332 | 333 | test_that("make_examples_simple works for two-segment examples", { 334 | text <- list( 335 | c( 336 | "First sequence, first segment.", 337 | "First sequence, second segment." 338 | ), 339 | c( 340 | "Second sequence, first segment.", 341 | "Second sequence, second segment.", 342 | "Second sequence, EXTRA segment." 343 | ), 344 | "Third sequence, only one segment." 345 | ) 346 | testthat::expect_warning( 347 | input_ex <- make_examples_simple(text), 348 | "ignored" 349 | ) 350 | testthat::expect_identical(input_ex[[1]]$text_a, text[[1]][[1]]) 351 | testthat::expect_identical(input_ex[[1]]$text_b, text[[1]][[2]]) 352 | testthat::expect_identical(input_ex[[2]]$text_a, text[[2]][[1]]) 353 | testthat::expect_identical(input_ex[[2]]$text_b, text[[2]][[2]]) 354 | testthat::expect_identical(input_ex[[3]]$text_a, text[[3]]) 355 | testthat::expect_null(input_ex[[3]]$text_b) 356 | }) 357 | -------------------------------------------------------------------------------- /tests/testthat/test-modeling.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | test_that("The BertConfig routines work", { 17 | config <- BertConfig(vocab_size = 30522L) 18 | expected_config <- list( 19 | "vocab_size" = 30522L, 20 | "hidden_size" = 768L, 21 | "num_hidden_layers" = 12L, 22 | "num_attention_heads" = 12L, 23 | "hidden_act" = "gelu", 24 | "intermediate_size" = 3072L, 25 | "hidden_dropout_prob" = 0.1, 26 | "attention_probs_dropout_prob" = 0.1, 27 | "max_position_embeddings" = 512L, 28 | "type_vocab_size" = 16L, 29 | "initializer_range" = 0.02 30 | ) 31 | testthat::expect_is(config, "BertConfig") 32 | testthat::expect_identical(names(config), names(expected_config)) 33 | 34 | json_file <- "bert_config.json" 35 | json_config <- bert_config_from_json_file(json_file) 36 | 37 | testthat::expect_is(json_config, "BertConfig") 38 | testthat::expect_identical(names(json_config), names(expected_config)) 39 | }) 40 | 41 | test_that("The BertModel routine works", { 42 | with(tensorflow::tf$variable_scope("tests", 43 | reuse = tensorflow::tf$AUTO_REUSE 44 | ), { 45 | input_ids <- tensorflow::tf$constant(list( 46 | list(31L, 51L, 99L), 47 | list(15L, 5L, 0L) 48 | )) 49 | 50 | input_mask <- tensorflow::tf$constant(list( 51 | list(1L, 1L, 1L), 52 | list(1L, 1L, 0L) 53 | )) 54 | token_type_ids <- tensorflow::tf$constant(list( 55 | list(0L, 0L, 1L), 56 | list(0L, 2L, 0L) 57 | )) 58 | config <- BertConfig( 59 | vocab_size = 32000L, 60 | hidden_size = 768L, 61 | num_hidden_layers = 8L, 62 | num_attention_heads = 12L, 63 | intermediate_size = 1024L 64 | ) 65 | model_train <- BertModel( 66 | config = config, 67 | is_training = TRUE, 68 | input_ids = input_ids, 69 | input_mask = input_mask, 70 | token_type_ids = token_type_ids 71 | ) 72 | model <- BertModel( 73 | config = config, 74 | is_training = FALSE, 75 | input_ids = input_ids, 76 | input_mask = NULL, 77 | token_type_ids = NULL 78 | ) 79 | }) 80 | testthat::expect_is(model, "BertModel") 81 | testthat::expect_is( 82 | model$embedding_output, 83 | "tensorflow.python.framework.ops.Tensor" 84 | ) 85 | testthat::expect_is( 86 | model$embedding_table, 87 | "tensorflow.python.ops.variables.RefVariable" 88 | ) 89 | testthat::expect_is( 90 | model$sequence_output, 91 | "tensorflow.python.framework.ops.Tensor" 92 | ) 93 | testthat::expect_is( 94 | model$pooled_output, 95 | "tensorflow.python.framework.ops.Tensor" 96 | ) 97 | testthat::expect_is( 98 | model$all_encoder_layers[[1]], 99 | "tensorflow.python.framework.ops.Tensor" 100 | ) 101 | 102 | # dropout should only be applied in training! 
103 | testthat::expect_true(grepl( 104 | pattern = "dropout", 105 | model_train$embedding_output$op$name 106 | )) 107 | testthat::expect_false(grepl( 108 | pattern = "dropout", 109 | model$embedding_output$op$name 110 | )) 111 | }) 112 | 113 | 114 | test_that("gelu works", { 115 | with( 116 | tensorflow::tf$variable_scope("tests", 117 | reuse = tensorflow::tf$AUTO_REUSE 118 | ), 119 | tfx <- tensorflow::tf$get_variable("tfx", tensorflow::shape(10L)) 120 | ) 121 | tgelu <- gelu(tfx) 122 | testthat::expect_is(tgelu, "tensorflow.python.framework.ops.Tensor") 123 | testthat::expect_identical(tgelu$shape$as_list(), 10L) 124 | }) 125 | 126 | test_that("get_activation works", { 127 | testthat::expect_identical(get_activation("gelu"), gelu) 128 | testthat::expect_equal( 129 | get_activation("relu"), 130 | tensorflow::tf$nn$relu 131 | ) 132 | testthat::expect_equal( 133 | get_activation("tanh"), 134 | tensorflow::tf$tanh 135 | ) 136 | testthat::expect_true(is.na(get_activation("linear"))) 137 | }) 138 | 139 | test_that("get_assignment_map_from_checkpoint works", { 140 | # Create a "model" with a couple variables that overlap some variable names in 141 | # the BERT checkpoint. (The actual variables aren't compatible with the 142 | # checkpoint.) The BERT checkpoint is large, and won't be included in repo. A 143 | # checkpoint is downloaded as part of test setup. Run this test only if the 144 | # checkpoint can be found. 145 | 146 | init_checkpoint <- file.path( 147 | cpdir, 148 | "bert_model.ckpt" 149 | ) 150 | 151 | # Checkpoint "path" is actually only a stub filename; add ".index" to 152 | # check for a specific file. 153 | testthat::skip_if_not(file.exists(paste0( 154 | init_checkpoint, 155 | ".index" 156 | )), 157 | message = "Checkpoint index not found; skipping test." 
158 | ) 159 | 160 | with(tensorflow::tf$variable_scope("bert", 161 | reuse = tensorflow::tf$AUTO_REUSE 162 | ), { 163 | test_ten1 <- tensorflow::tf$get_variable( 164 | "encoder/layer_9/output/dense/bias", 165 | shape = c(1L, 2L, 3L) 166 | ) 167 | test_ten2 <- tensorflow::tf$get_variable( 168 | "encoder/layer_9/output/dense/kernel", 169 | shape = c(1L, 2L, 3L) 170 | ) 171 | }) 172 | tvars <- tensorflow::tf$get_collection( 173 | tensorflow::tf$GraphKeys$GLOBAL_VARIABLES 174 | ) 175 | 176 | amap <- get_assignment_map_from_checkpoint(tvars, init_checkpoint) 177 | expected_result <- readRDS("sample_amap.rds") 178 | testthat::expect_identical(amap, expected_result) 179 | }) 180 | 181 | 182 | test_that("dropout works", { 183 | with( 184 | tensorflow::tf$variable_scope("tests", 185 | reuse = tensorflow::tf$AUTO_REUSE 186 | ), 187 | todrop <- tensorflow::tf$get_variable( 188 | "todrop", 189 | tensorflow::shape(10L, 20L) 190 | ) 191 | ) 192 | dropped <- dropout(todrop, 0.3) 193 | testthat::expect_is(dropped, "tensorflow.python.framework.ops.Tensor") 194 | testthat::expect_true(grepl(pattern = "dropout", dropped$op$name)) 195 | }) 196 | 197 | test_that("layer_norm works", { 198 | with( 199 | tensorflow::tf$variable_scope("tests", 200 | reuse = tensorflow::tf$AUTO_REUSE 201 | ), 202 | lnorm <- tensorflow::tf$get_variable("lnorm", tensorflow::shape(10L)) 203 | ) 204 | normed <- layer_norm(lnorm) 205 | testthat::expect_is(normed, "tensorflow.python.framework.ops.Tensor") 206 | testthat::expect_true(grepl(pattern = "LayerNorm", normed$op$name)) 207 | }) 208 | 209 | test_that("layer_norm_and_dropout works", { 210 | with( 211 | tensorflow::tf$variable_scope("tests", 212 | reuse = tensorflow::tf$AUTO_REUSE 213 | ), 214 | lndr <- tensorflow::tf$get_variable("lndr", tensorflow::shape(10L)) 215 | ) 216 | normed_and_dropped <- layer_norm_and_dropout(lndr, dropout_prob = 0.5) 217 | testthat::expect_is( 218 | normed_and_dropped, 219 | "tensorflow.python.framework.ops.Tensor" 220 | ) 221 | testthat::expect_true(grepl(pattern = "dropout", normed_and_dropped$op$name)) 222 | }) 223 | 224 | test_that("create_initializer works", { 225 | init <- create_initializer() 226 | testthat::expect_is(init, "tensorflow.python.ops.init_ops.TruncatedNormal") 227 | }) 228 | 229 | test_that("embedding_lookup works", { 230 | with(tensorflow::tf$variable_scope("tests", 231 | reuse = tensorflow::tf$AUTO_REUSE 232 | ), { 233 | ids <- tensorflow::tf$get_variable("ids", 234 | dtype = "int32", 235 | shape = tensorflow::shape(10, 20) 236 | ) 237 | el <- embedding_lookup(ids, 238 | vocab_size = 100L, 239 | word_embedding_name = "some_name" 240 | ) 241 | }) 242 | testthat::expect_is(el[[1]], "tensorflow.python.framework.ops.Tensor") 243 | testthat::expect_is(el[[2]], "tensorflow.python.ops.variables.RefVariable") 244 | }) 245 | 246 | test_that("embedding_postprocessor works", { 247 | batch_size <- 10 248 | seq_length <- 512 249 | embedding_size <- 200 250 | with(tensorflow::tf$variable_scope("tests", 251 | reuse = tensorflow::tf$AUTO_REUSE 252 | ), { 253 | input_tensor <- tensorflow::tf$get_variable( 254 | "input_epp", 255 | dtype = "float", 256 | shape = tensorflow::shape(batch_size, seq_length, embedding_size) 257 | ) 258 | token_type_ids <- tensorflow::tf$get_variable( 259 | "ids_epp", 260 | dtype = "int32", 261 | shape = tensorflow::shape(batch_size, seq_length) 262 | ) 263 | 264 | pp_embed <- embedding_postprocessor(input_tensor, 265 | use_token_type = TRUE, 266 | token_type_ids = token_type_ids 267 | ) 268 | }) 269 | 
testthat::expect_is(pp_embed, "tensorflow.python.framework.ops.Tensor") 270 | testthat::expect_true(grepl(pattern = "dropout", pp_embed$op$name)) 271 | }) 272 | 273 | test_that("create_attention_mask_from_input_mask works", { 274 | with(tensorflow::tf$variable_scope("tests", 275 | reuse = tensorflow::tf$AUTO_REUSE 276 | ), { 277 | from_tensor <- ids <- tensorflow::tf$get_variable( 278 | "ften", 279 | dtype = "float", 280 | shape = tensorflow::shape(10, 20) 281 | ) 282 | to_mask <- ids <- tensorflow::tf$get_variable( 283 | "mask", 284 | dtype = "int32", 285 | shape = tensorflow::shape(10, 30) 286 | ) 287 | amask <- create_attention_mask_from_input_mask(from_tensor, to_mask) 288 | }) 289 | testthat::expect_is(amask, "tensorflow.python.framework.ops.Tensor") 290 | testthat::expect_identical(amask$shape$as_list(), c(10L, 20L, 30L)) 291 | }) 292 | 293 | test_that("transformer_model works", { 294 | batch_size <- 10 295 | seq_length <- 500 296 | hidden_size <- 120 297 | num_hidden <- 7 298 | 299 | with(tensorflow::tf$variable_scope("tests", 300 | reuse = tensorflow::tf$AUTO_REUSE 301 | ), { 302 | input_tensor <- tensorflow::tf$get_variable("input_tm", 303 | shape = c( 304 | batch_size, 305 | seq_length, 306 | hidden_size 307 | ) 308 | ) 309 | model_t <- transformer_model( 310 | input_tensor = input_tensor, 311 | hidden_size = hidden_size, 312 | num_hidden_layers = num_hidden, 313 | do_return_all_layers = TRUE 314 | ) 315 | }) 316 | # ATTN: modified below to account for attention_data 317 | attention_data <- model_t$attention_data 318 | testthat::expect_equal(length(attention_data), num_hidden) 319 | testthat::expect_is( 320 | attention_data[[num_hidden]], 321 | "tensorflow.python.framework.ops.Tensor" 322 | ) 323 | model_t <- model_t$final_outputs 324 | # ATTN: modified above to account for attention_data 325 | 326 | testthat::expect_equal(length(model_t), num_hidden) 327 | testthat::expect_is( 328 | model_t[[num_hidden]], 329 | "tensorflow.python.framework.ops.Tensor" 330 | ) 331 | }) 332 | 333 | 334 | test_that("get_shape_list works", { 335 | with(tensorflow::tf$variable_scope("tests", 336 | reuse = tensorflow::tf$AUTO_REUSE 337 | ), { 338 | phold <- tensorflow::tf$placeholder(tensorflow::tf$int32, 339 | shape = tensorflow::shape(4) 340 | ) 341 | static_shape <- get_shape_list(phold) 342 | tfunique <- tensorflow::tf$unique(phold) 343 | tfy <- tfunique$y 344 | dynamic_shape <- get_shape_list(tfy) 345 | }) 346 | testthat::expect_identical(static_shape, list(4L)) 347 | testthat::expect_is( 348 | dynamic_shape[[1]], 349 | "tensorflow.python.framework.ops.Tensor" 350 | ) 351 | }) 352 | 353 | test_that("reshape to/from matrix functions work", { 354 | with( 355 | tensorflow::tf$variable_scope("tests", 356 | reuse = tensorflow::tf$AUTO_REUSE 357 | ), 358 | r3t <- tensorflow::tf$get_variable("r3t", 359 | dtype = "int32", 360 | shape = tensorflow::shape(10, 20, 3) 361 | ) 362 | ) 363 | mat <- reshape_to_matrix(r3t) 364 | testthat::expect_is(mat, "tensorflow.python.framework.ops.Tensor") 365 | testthat::expect_identical(mat$shape$as_list(), c(200L, 3L)) 366 | 367 | ten3 <- reshape_from_matrix(mat, orig_shape_list = list(10L, 20L, 3L)) 368 | testthat::expect_is(ten3, "tensorflow.python.framework.ops.Tensor") 369 | testthat::expect_identical(ten3$shape$as_list(), c(10L, 20L, 3L)) 370 | }) 371 | 372 | test_that("assert_rank works", { 373 | with(tensorflow::tf$variable_scope("tests", 374 | reuse = tensorflow::tf$AUTO_REUSE 375 | ), { 376 | ten <- tensorflow::tf$get_variable("ten", 377 | dtype = "int32", 378 | 
shape = tensorflow::shape(10) 379 | ) 380 | testthat::expect_true(assert_rank(ten, 1)) 381 | testthat::expect_true(assert_rank(ten, 1:2)) 382 | testthat::expect_error(assert_rank(ten, 2), "not equal") 383 | }) 384 | }) 385 | -------------------------------------------------------------------------------- /tests/testthat/test-optimization.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | test_that("create_optimizer works", { 17 | with(tensorflow::tf$variable_scope("tests", 18 | reuse = tensorflow::tf$AUTO_REUSE 19 | ), { 20 | totrain <- tensorflow::tf$get_variable( 21 | "totrain", 22 | tensorflow::shape(10L, 20L) 23 | ) 24 | loss <- 2 * totrain 25 | 26 | t_op <- create_optimizer( 27 | loss = loss, 28 | init_lr = 0.01, 29 | num_train_steps = 20L, 30 | num_warmup_steps = 10L, 31 | use_tpu = FALSE 32 | ) 33 | }) 34 | 35 | testthat::expect_is( 36 | t_op, 37 | "tensorflow.python.framework.ops.Operation" 38 | ) 39 | 40 | testthat::expect_true(grepl(pattern = "group_deps", t_op$name)) 41 | 42 | # now actually put some training variables in place... 43 | with( 44 | tensorflow::tf$variable_scope("tests", 45 | reuse = tensorflow::tf$AUTO_REUSE 46 | ), 47 | totrain <- tensorflow::tf$get_variable( 48 | "totrain", 49 | tensorflow::shape(10L, 20L) 50 | ) 51 | ) 52 | }) 53 | 54 | 55 | test_that("AdamWeightDecayOptimizer works", { 56 | with(tensorflow::tf$variable_scope("tests", 57 | reuse = tensorflow::tf$AUTO_REUSE 58 | ), { 59 | awd_opt <- AdamWeightDecayOptimizer(learning_rate = 0.01) 60 | }) 61 | 62 | testthat::expect_is( 63 | awd_opt, 64 | "AdamWeightDecayOptimizer" 65 | ) 66 | testthat::expect_is( 67 | awd_opt, 68 | "tensorflow.python.training.optimizer.Optimizer" 69 | ) 70 | # after our hack, `apply_gradients` is a function, not a method. 71 | testthat::expect_is( 72 | awd_opt$apply_gradients, 73 | "python.builtin.function" 74 | ) 75 | }) 76 | -------------------------------------------------------------------------------- /tests/testthat/test-run_classifier.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | test_that("truncate_seq_pair works", { 17 | tokens_a <- c("a", "b", "c", "d") 18 | tokens_b <- c("w", "x", "y", "z") 19 | trunc_seq <- truncate_seq_pair(tokens_a, tokens_b, 5) 20 | expected_result <- list( 21 | trunc_a = c("a", "b", "c"), 22 | trunc_b = c("w", "x") 23 | ) 24 | testthat::expect_identical(trunc_seq, expected_result) 25 | }) 26 | 27 | test_that("create_model works", { 28 | with(tensorflow::tf$variable_scope("tests_class1", 29 | reuse = tensorflow::tf$AUTO_REUSE 30 | ), { 31 | input_ids <- tensorflow::tf$constant(list( 32 | list(31L, 51L, 99L), 33 | list(15L, 5L, 0L) 34 | )) 35 | 36 | input_mask <- tensorflow::tf$constant(list( 37 | list(1L, 1L, 1L), 38 | list(1L, 1L, 0L) 39 | )) 40 | token_type_ids <- tensorflow::tf$constant(list( 41 | list(0L, 0L, 1L), 42 | list(0L, 2L, 0L) 43 | )) 44 | config <- BertConfig( 45 | vocab_size = 32000L, 46 | hidden_size = 768L, 47 | num_hidden_layers = 8L, 48 | type_vocab_size = 2L, 49 | num_attention_heads = 12L, 50 | intermediate_size = 1024L 51 | ) 52 | class_model <- create_model( 53 | bert_config = config, 54 | is_training = TRUE, 55 | input_ids = input_ids, 56 | input_mask = input_mask, 57 | segment_ids = token_type_ids, 58 | labels = c(1L, 2L), 59 | num_labels = 2L 60 | ) 61 | }) 62 | testthat::expect_is( 63 | class_model$loss, 64 | "tensorflow.python.framework.ops.Tensor" 65 | ) 66 | testthat::expect_is( 67 | class_model$per_example_loss, 68 | "tensorflow.python.framework.ops.Tensor" 69 | ) 70 | testthat::expect_is( 71 | class_model$logits, 72 | "tensorflow.python.framework.ops.Tensor" 73 | ) 74 | testthat::expect_is( 75 | class_model$probabilities, 76 | "tensorflow.python.framework.ops.Tensor" 77 | ) 78 | 79 | testthat::expect_true(grepl( 80 | pattern = "Mean", 81 | class_model$loss$op$name 82 | )) 83 | testthat::expect_true(grepl( 84 | pattern = "Neg", 85 | class_model$per_example_loss$op$name 86 | )) 87 | testthat::expect_true(grepl( 88 | pattern = "BiasAdd", 89 | class_model$logits$op$name 90 | )) 91 | testthat::expect_true(grepl( 92 | pattern = "Softmax", 93 | class_model$probabilities$op$name 94 | )) 95 | }) 96 | 97 | test_that("model_fn_builder works", { 98 | # Run this test only if the checkpoint can be found. 99 | init_checkpoint <- file.path( 100 | cpdir, # from setup.R 101 | "bert_model.ckpt" 102 | ) 103 | 104 | # Checkpoint "path" is actually only a stub filename; add ".index" to 105 | # check for a specific file. 106 | testthat::skip_if_not(file.exists(paste0( 107 | init_checkpoint, 108 | ".index" 109 | )), 110 | message = "Checkpoint index not found; skipping test." 
111 | ) 112 | with(tensorflow::tf$variable_scope("tests_class2", 113 | reuse = tensorflow::tf$AUTO_REUSE 114 | ), { 115 | input_ids <- tensorflow::tf$constant(list( 116 | list(31L, 51L, 99L), 117 | list(15L, 5L, 0L) 118 | )) 119 | 120 | input_mask <- tensorflow::tf$constant(list( 121 | list(1L, 1L, 1L), 122 | list(1L, 1L, 0L) 123 | )) 124 | token_type_ids <- tensorflow::tf$constant(list( 125 | list(0L, 0L, 1L), 126 | list(0L, 2L, 0L) 127 | )) 128 | config <- BertConfig( 129 | vocab_size = 30522L, 130 | hidden_size = 768L, 131 | num_hidden_layers = 8L, 132 | type_vocab_size = 2L, 133 | num_attention_heads = 12L, 134 | intermediate_size = 3072L 135 | ) 136 | 137 | test_mod_fn <- model_fn_builder( 138 | bert_config = config, 139 | num_labels = 2L, 140 | init_checkpoint = init_checkpoint, 141 | learning_rate = 0.01, 142 | num_train_steps = 20L, 143 | num_warmup_steps = 10L, 144 | use_tpu = FALSE 145 | ) 146 | # After we implement InputFeatures class, come back and add tests for 147 | # `test_mod_fn`. Something like this, but better: 148 | # features <- list() 149 | # features$input_ids <- input_ids 150 | # features$input_mask <- input_mask 151 | # features$segment_ids <- token_type_ids 152 | # features$label_ids <- c(1L, 2L) 153 | 154 | # mod_fn_output <- test_mod_fn(features = features, 155 | # labels = NULL, 156 | # mode = "train", 157 | # params = NULL) 158 | }) 159 | # This isn't much of a test, but it does confirm that the maker function 160 | # ran, which is non-trivial. 161 | testthat::expect_is(test_mod_fn, "function") 162 | }) 163 | 164 | test_that("Examples/features creation routines work", { 165 | tokenizer <- FullTokenizer("vocab.txt") 166 | input_ex1 <- InputExample( 167 | guid = 1L, 168 | text_a = "Some text to classify.", 169 | text_b = "More wordy words.", 170 | label = "good" 171 | ) 172 | 173 | testthat::expect_is(input_ex1, "InputExample") 174 | testthat::expect_identical( 175 | names(input_ex1), 176 | c("guid", "text_a", "text_b", "label") 177 | ) 178 | input_ex2 <- InputExample( 179 | guid = 2L, 180 | text_a = "This is another example.", 181 | text_b = "So many words.", 182 | label = "bad" 183 | ) 184 | feat <- convert_examples_to_features( 185 | examples = list(input_ex1, input_ex2), 186 | label_list = c("good", "bad"), 187 | max_seq_length = 15L, 188 | tokenizer = tokenizer 189 | ) 190 | testthat::expect_identical(length(feat), 2L) 191 | testthat::expect_is(feat[[1]], "InputFeatures") 192 | testthat::expect_identical( 193 | names(feat[[1]]), 194 | c( 195 | "input_ids", 196 | "input_mask", 197 | "segment_ids", 198 | "label_id", 199 | "is_real_example" 200 | ) 201 | ) 202 | }) 203 | -------------------------------------------------------------------------------- /tests/testthat/test-tokenization.R: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bedford Freeman & Worth Pub Grp LLC DBA Macmillan Learning. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | test_that("the convert token/id functions work", { 17 | vocab <- c("token1" = 0, "token2" = 1, "token3" = 2) 18 | inv_voc <- names(vocab) 19 | names(inv_voc) <- vocab 20 | 21 | test_result <- convert_tokens_to_ids(vocab, c("token1", "token3")) 22 | expected_result <- c("token1" = 0, "token3" = 2) 23 | testthat::expect_identical(test_result, expected_result) 24 | 25 | test_result <- convert_ids_to_tokens(inv_voc, c(1, 3)) 26 | expected_result <- c("0" = "token1", "2" = "token3") 27 | testthat::expect_identical(test_result, expected_result) 28 | }) 29 | 30 | test_that("The FullTokenizer tokenizer works as expected", { 31 | f_tokenizer <- FullTokenizer("vocab.txt", TRUE) 32 | text <- "\u535A\u00E7anned words; tihs is Silly" 33 | test_result <- tokenize(f_tokenizer, text) 34 | expected_result <- c( 35 | "\u535A", "canned", "words", ";", 36 | "ti", "##hs", "is", "silly" 37 | ) 38 | testthat::expect_identical(test_result, expected_result) 39 | 40 | f_tokenizer <- FullTokenizer("vocab_small.txt", TRUE) 41 | text <- "know the unknowable!" 42 | test_result <- tokenize(f_tokenizer, text) 43 | expected_result <- c( 44 | "know", "the", "un", "##know", 45 | "##able", "[UNK]" 46 | ) 47 | testthat::expect_identical(test_result, expected_result) 48 | }) 49 | 50 | 51 | test_that("Tokenizers handle edge cases correctly", { 52 | test_string <- "remove char: \ufffd " 53 | vocab <- load_vocab(vocab_file = "vocab.txt") 54 | 55 | b_tokenizer <- BasicTokenizer(TRUE) 56 | test_result <- tokenize(b_tokenizer, text = test_string) 57 | expected_result <- c("remove", "char", ":") 58 | testthat::expect_identical(test_result, expected_result) 59 | 60 | wp_tokenizer <- WordpieceTokenizer(vocab, max_input_chars_per_word = 4) 61 | test_result <- tokenize(wp_tokenizer, text = "excessively long") 62 | expected_result <- c("[UNK]", "long") 63 | testthat::expect_identical(test_result, expected_result) 64 | 65 | expect_identical( 66 | load_vocab("vocab0.txt"), 67 | integer(0) 68 | ) 69 | }) 70 | 71 | 72 | test_that("whitespace_tokenize splits a string on whitespace", { 73 | test_string <- " some\ttext\nwith whitespace " 74 | test_result <- whitespace_tokenize(test_string) 75 | expected_result <- c("some", "text", "with", "whitespace") 76 | testthat::expect_identical(test_result, expected_result) 77 | }) 78 | 79 | 80 | 81 | test_that("strip_accents replaces accented chars with nearest equivalents", { 82 | test_string <- "fa\u00E7ile" 83 | test_result <- strip_accents(test_string) 84 | expected_result <- "facile" 85 | testthat::expect_identical(test_result, expected_result) 86 | }) 87 | 88 | 89 | test_that("split_on_punc splits a string before and after punctuation chars", { 90 | test_string <- "stop! don't touch that." 91 | test_result <- split_on_punc(test_string) 92 | expected_result <- c("stop", "!", " don", "'", "t touch that", ".") 93 | testthat::expect_identical(test_result, expected_result) 94 | 95 | test_string <- "!" 
96 | test_result <- split_on_punc(test_string) 97 | expected_result <- c("!") 98 | testthat::expect_identical(test_result, expected_result) 99 | }) 100 | 101 | 102 | 103 | 104 | test_that("is_whitespace correctly classifies characters", { 105 | # tests from BERT: tokenization_test.py 106 | testthat::expect_true(is_whitespace(" ")) 107 | testthat::expect_true(is_whitespace("\t")) 108 | testthat::expect_true(is_whitespace("\r")) 109 | testthat::expect_true(is_whitespace("\n")) 110 | testthat::expect_true(is_whitespace("\u00A0")) # non-breaking space 111 | 112 | testthat::expect_false(is_whitespace("A")) 113 | testthat::expect_false(is_whitespace("-")) 114 | }) 115 | 116 | 117 | test_that("is_control correctly classifies characters", { 118 | # tests from BERT: tokenization_test.py 119 | testthat::expect_true(is_control("\u0005")) # 'Enquiry' control character 120 | 121 | testthat::expect_false(is_control("A")) 122 | testthat::expect_false(is_control(" ")) 123 | testthat::expect_false(is_control("\t")) 124 | testthat::expect_false(is_control("\r")) 125 | }) 126 | 127 | 128 | 129 | test_that("is_punctuation correctly classifies characters", { 130 | # tests from BERT: tokenization_test.py 131 | testthat::expect_true(is_punctuation("-")) 132 | testthat::expect_true(is_punctuation("$")) 133 | testthat::expect_true(is_punctuation("`")) 134 | testthat::expect_true(is_punctuation(".")) 135 | 136 | testthat::expect_false(is_punctuation("A")) 137 | testthat::expect_false(is_punctuation(" ")) 138 | }) 139 | 140 | 141 | test_that("tokenize_text works correctly", { 142 | text <- c("Who doesn't like tacos?", "Not me!") 143 | tokens <- tokenize_text(text = text, ckpt_dir = cpdir) 144 | testthat::expect_identical(length(tokens[[1]]), 10L) 145 | testthat::expect_identical(length(tokens[[2]]), 5L) 146 | }) 147 | 148 | test_that("check_vocab works correctly", { 149 | to_check <- c("apple", "appl") 150 | vcheck <- check_vocab(words = to_check, ckpt_dir = cpdir) 151 | testthat::expect_identical(vcheck, c(TRUE, FALSE)) 152 | }) 153 | -------------------------------------------------------------------------------- /tests/testthat/test_checkpoints/.gitignore: -------------------------------------------------------------------------------- 1 | bert_base_uncased.zip 2 | -------------------------------------------------------------------------------- /tests/testthat/test_checkpoints/testing_checkpoint.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/test_checkpoints/testing_checkpoint.tar.gz -------------------------------------------------------------------------------- /tests/testthat/tokenizer.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/tokenizer.rds -------------------------------------------------------------------------------- /tests/testthat/vocab0.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonathanbratt/RBERT/d32c3a7b2cce0ce4fb93f64eae9e3f7e85cc6158/tests/testthat/vocab0.txt -------------------------------------------------------------------------------- /tests/testthat/vocab_small.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | un 3 | ##know 4 | ##able 5 | the 6 | to 7 | know 8 | 
-------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/BERT_basics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "BERT Basics" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | vignette: > 7 | %\VignetteIndexEntry{BERT Basics} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 27 | 28 | ```{r, include = FALSE} 29 | knitr::opts_chunk$set( 30 | collapse = TRUE, 31 | comment = "#>" 32 | ) 33 | ``` 34 | 35 | 36 | 37 | 38 | 39 | 40 | If you are not familiar with BERT, I suggest you check out the helpful blog post [here](http://jalammar.github.io/illustrated-bert/), and the resources linked from it. 41 | 42 | Until you have time to do that, this vignette is intended to be a quick, bare-bones introduction to BERT--just enough so that the rest of this package makes sense. 43 | 44 | ## What is BERT? 45 | 46 | For now, think of BERT as a function that takes in text and puts out numbers--a long list of numbers for *each* token (~word) of the input. 47 | 48 | In fact, there is really a family of such functions. 49 | Google research released the first BERT models in late 2018, and others have followed (as well as many that include slight modifications to the original structure). 50 | When referring to any results using "BERT", it is important to specify which BERT you're talking about. 51 | 52 | ## The input to BERT 53 | 54 | ### Tokenization 55 | 56 | BERT takes "natural" text as input, with some restrictions. 57 | The first thing that BERT does is to [tokenize](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) the input text. 58 | Most common words will be tokenized as themselves, but words that are not included in the vocabulary of that particular version of BERT will be tokenized as two or more "word pieces". 59 | Each character of punctuation also gets its own token. 60 | Finally, BERT adds a few special delimiter tokens to each piece of input. 61 | 62 | For example, "Who doesn't like tacos?" might be tokenized as: 63 | 64 | `[CLS] who doesn ' t like ta ##cos ? [SEP]` 65 | 66 | The "##" bit indicates that "cos" was originally attached to "ta". 67 | The word "tacos" was split up this way because that word isn't found in (this version of) BERT's limited vocabulary. 68 | 69 | Any output from BERT is organized in terms of tokens like this. 70 | 71 | ### Sequences 72 | 73 | Current BERT models can process chunks of text that consist of no more than 512 tokens (so the maximum number of words is rather fewer than that in practice). 74 | If you have text longer than that, you will need to find some way of splitting it up. 75 | A natural way of splitting up your text is by individual sentences, if possible. 76 | In this package (consistently) and in other literature (perhaps less consistently) a chunk of text processed by BERT is referred to as a "sequence". 77 | So a list of such chunks may be indexed by a "sequence_index", for example. 78 | 79 | ### Segments 80 | 81 | A sequence *may* be divided into two [^segments] "segments". 82 | This is useful when your particular application calls for two distinct pieces of text to be input (e.g. a model that evaluates the logical compatibility of two statements). 
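In RBERT, a two-segment sequence is supplied as a two-element character vector (the first element becomes segment A, the second segment B), wrapped in a list when there are several sequences.
The chunk below is a minimal, non-evaluated sketch; it assumes a BERT checkpoint has already been downloaded (for example with `RBERT::download_BERT_checkpoint("bert_base_uncased")`) and that its path is stored in `BERT_PRETRAINED_DIR`.

```{r, eval = FALSE}
# One input sequence built from two segments.
two_segment_example <- list(
  c(
    "I saw the branch on the bank.", # segment A
    "A second sequence of words."    # segment B
  )
)

feats <- RBERT::extract_features(
  examples = two_segment_example,
  ckpt_dir = BERT_PRETRAINED_DIR
)
```
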
83 | Note that the sequence as a whole still can't exceed the token limit, so splitting your text into segments is not a way to input longer text. 84 | In fact, a delimiter token is required to separate the two segments, which counts against the 512 limit, so you'll actually lose a bit of capacity by using segments. 85 | 86 | When possible, it's probably best to use individual sentences as your input sequences (or segments, if you're going that way). 87 | BERT was trained at the sentence level, and you're less likely to hit the token limit with individual sentences than with, say, paragraphs of text. 88 | 89 | [^segments]: In principle, the input could have any number of segments, but the BERT models are limited to two segments. 90 | 91 | ## The output of BERT 92 | 93 | ### Embeddings 94 | 95 | One way of thinking about BERT is as a machine for producing context-dependent [embeddings](https://en.wikipedia.org/wiki/Word_embedding). 96 | Here, an "embedding" is a vector [^vector] that gets associated with each token of the input. 97 | Useful embeddings will have a number of special properties. 98 | For example, tokens with similar meanings will have embeddings that are "nearby" in the embedding space. 99 | 100 | Static embeddings, such as [word2vec](https://en.wikipedia.org/wiki/Word2vec), have been around for several years. 101 | However, they typically have been unable to distinguish between [homographs](https://en.wikipedia.org/wiki/Homograph), such as "train [teach]" and "train [locomotive]". 102 | More generally, such embeddings are insensitive to word order, sentence structure, and other contextual cues. 103 | 104 | [^vector]: Think, "point in a high-dimensional space." 105 | 106 | ### Context-dependent embeddings 107 | 108 | In contrast, BERT's output can be understood as embedding vectors that *are* appropriately sensitive to the context in which each word is used. 109 | Not only does this make it possible to give homographs their own embeddings, it also allows more subtle differences in meaning and usage to be picked up. 110 | 111 | BERT outputs an embedding vector for each input token, including the special tokens "[CLS]" and "[SEP]". 112 | The vector for [CLS] can be thought of as "pure context"; it's the embedding of a token that has no intrinsic meaning, but is still sensitive to the context around it.[^CLS] 113 | 114 | BERT has a layered structure (see next section), and output embedding vectors can be obtained at each layer. 115 | 116 | [^CLS]: The interpretation of [CLS] is a bit more nuanced than this simple explanation implies. 117 | See [this discussion](https://github.com/google-research/bert/issues/196) for more details. 118 | 119 | ## The insides of BERT 120 | 121 | Context-dependence is achieved through the "attention" mechanism. 122 | In very cartoony terms, "attention" provides a way for each token to "choose" (based on training) which of its surrounding tokens to be most influenced by. 123 | BERT consists of multiple sequential layers of attention, with multiple "heads" per layer. 124 | Each head may split its attention across any of the input tokens (including itself). 125 | It may be helpful to picture each token processed by BERT as a many-headed beast, able to attend at each moment to any or all of its neighbors, and modify itself slightly in the next moment based on what it sees. 126 | 127 | The amount of attention paid by each token, to each token, in each layer and head, can be represented by a weight that is normalized to one for each "attender". 
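A rough, non-evaluated sketch of how these weights can be requested (again assuming `BERT_PRETRAINED_DIR` holds the path to a downloaded checkpoint):

```{r, eval = FALSE}
feats <- RBERT::extract_features(
  examples = "Who doesn't like tacos?",
  ckpt_dir = BERT_PRETRAINED_DIR,
  features = c("output", "attention")
)

# A tidy tibble with one attention weight for each combination of layer, head,
# attending token, and attended-to token in each input sequence.
feats$attention
```
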
128 | While these weights are not part of the formal output of BERT, it can be instructive to study them to better understand what aspects of language BERT models well. 129 | 130 | RBERT makes it easy (using the `extract_features` function) to obtain both the attention weights and the token embeddings at each layer of a BERT model. 131 | -------------------------------------------------------------------------------- /vignettes/RBERT_intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to RBERT" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Introduction to RBERT} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | 25 | 26 | ```{r, include = FALSE} 27 | knitr::opts_chunk$set( 28 | collapse = TRUE, 29 | comment = "#>" 30 | ) 31 | ``` 32 | 33 | 34 | 35 | 36 | 37 | RBERT is an implementation of Google Research's 38 | [BERT](https://github.com/google-research/bert) in `R`. 39 | 40 | BERT is a powerful general-purpose language model 41 | (paper [here](https://arxiv.org/pdf/1810.04805.pdf), 42 | helpful blog post [here](http://jalammar.github.io/illustrated-bert/)). 43 | BERT is written in Python, using [TensorFlow](https://www.tensorflow.org/). 44 | An `R` package for TensorFlow already [exists](https://tensorflow.rstudio.com/), 45 | so the goal of this project is to fully implement BERT in `R` down to the level 46 | of the TensorFlow API. 47 | 48 | Generally speaking, there are three levels at which BERT could be used: 49 | 50 | 1. Using the output of a pre-trained BERT model as features for downstream model 51 | 2. Fine-tuning on top of a pre-trained BERT model 52 | 3. Training a BERT model from scratch 53 | 54 | Currently, RBERT is functional at the first level, and possibly functional 55 | at the second level (speed becomes a significant consideration at this level). 56 | 57 | # Getting started with RBERT 58 | 59 | RBERT requires the tensorflow package to be installed and working. If that 60 | requirement is met, using RBERT at the first level is fairly 61 | straightforward. 62 | 63 | ```{r eval = FALSE} 64 | library(RBERT) 65 | library(dplyr) 66 | 67 | # Download pre-trained BERT model. This will go to an appropriate cache 68 | # directory by default. 69 | BERT_PRETRAINED_DIR <- RBERT::download_BERT_checkpoint( 70 | model = "bert_base_uncased" 71 | ) 72 | 73 | text_to_process <- c("Impulse is equal to the change in momentum.", 74 | "Changing momentum requires an impulse.", 75 | "An impulse is like a push.", 76 | "Impulse is force times time.") 77 | 78 | # Or make two-segment examples: 79 | text_to_process2 <- list(c("Impulse is equal to the change in momentum.", 80 | "Changing momentum requires an impulse."), 81 | c("An impulse is like a push.", 82 | "Impulse is force times time.")) 83 | 84 | BERT_feats <- extract_features( 85 | examples = text_to_process2, 86 | ckpt_dir = BERT_PRETRAINED_DIR, 87 | layer_indexes = 1:12 88 | ) 89 | 90 | # Extract the final layer output vector for the "[CLS]" token of the first 91 | # sentence. 92 | output_vector1 <- BERT_feats$output %>% 93 | dplyr::filter( 94 | sequence_index == 1, 95 | token == "[CLS]", 96 | layer_index == 12 97 | ) %>% 98 | dplyr::select(dplyr::starts_with("V")) %>% 99 | unlist() 100 | output_vector1 101 | 102 | # Extract output vectors for all sentences... 103 | # These vectors can be used as input features for downstream models. 
104 | # Convenience functions for doing this extraction will be added to the 105 | # package in the near future. 106 | output_vectors <- BERT_feats$output %>% 107 | dplyr::filter(token_index == 1, layer_index == 12) 108 | output_vectors 109 | 110 | ``` 111 | 112 | # Other Functions 113 | 114 | RBERT exports a couple of functions that may be helpful when exploring BERT. 115 | 116 | ```{r, eval = FALSE} 117 | # Both of the functions below require a vocabulary (or a checkpoint containing a 118 | # vocab.txt file) to be specified. 119 | BERT_PRETRAINED_DIR <- download_BERT_checkpoint("bert_base_uncased") 120 | 121 | # `tokenize_text` is a quick way to see the wordpiece tokenization of some text. 122 | tokens <- tokenize_text(text = "Who doesn't like tacos?", 123 | ckpt_dir = BERT_PRETRAINED_DIR) 124 | # [[1]] 125 | # [1] "[CLS]" "who" "doesn" "'" "t" "like" "ta" "##cos" 126 | # [9] "?" "[SEP]" 127 | 128 | # `check_vocab` checks whether the given words are found in the vocabulary. 129 | check_vocab(words = c("positron", "electron"), ckpt_dir = BERT_PRETRAINED_DIR) 130 | # [1] FALSE TRUE 131 | ``` 132 | 133 | 134 | # Future work 135 | 136 | There's still a lot to do! Check out the 137 | [issues board](https://github.com/macmillanhighered/RBERT/issues) 138 | on the github page. 139 | --------------------------------------------------------------------------------