├── LICENSE
├── LICENSE-MODEL.md
├── MANIFEST.in
├── README.md
├── docs
    ├── PROMPTBOOK.md
    ├── model_card.md
    └── source
    │   └── img
    │       ├── logo.png
    │       └── logo_black.png
├── galai
    ├── __init__.py
    ├── model.py
    ├── notebook_utils.py
    ├── parallel_policy.py
    └── utils.py
├── notebooks
    ├── Introduction to Galactica Models.ipynb
    └── Introduction to Galactica Models.pdf
├── requirements.txt
└── setup.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/LICENSE-MODEL.md:
--------------------------------------------------------------------------------
  1 | # Creative Commons Attribution-NonCommercial 4.0 International Public License
  2 | 
  3 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
  4 | 
  5 | ## Section 1 – Definitions.
  6 | 
  7 | a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
  8 | 
  9 | b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
 10 | 
 11 | c. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
 12 | 
 13 | d. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
 14 | 
 15 | e. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
 16 | f. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
 17 | 
 18 | g. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
 19 | 
 20 | h. Licensor means the individual(s) or entity(ies) granting rights under this Public License.
 21 | 
 22 | i. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
 23 | 
 24 | j. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
 25 | 
 26 | k. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
 27 | 
 28 | l. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
 29 | 
 30 | ## Section 2 – Scope.
 31 | 
 32 | a. License grant.
 33 | 	1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
 34 | 		A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
 35 | 		B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
 36 | 
 37 | 	2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
 38 | 	3. Term. The term of this Public License is specified in Section 6(a).
 39 | 	4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
 40 | 	5. Downstream recipients.
 41 | 		a. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
 42 | 		b. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
 43 | 	6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
 44 | 
 45 | b. Other rights.
 46 | 
 47 | 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
 48 | 
 49 | 2. Patent and trademark rights are not licensed under this Public License.
 50 | 
 51 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
 52 | 
 53 | ## Section 3 – License Conditions.
 54 | 
 55 | Your exercise of the Licensed Rights is expressly made subject to the following conditions.
 56 | 
 57 | a. Attribution.
 58 | 
 59 | 1. If You Share the Licensed Material (including in modified form), You must:
 60 | 
 61 | 	A. retain the following if it is supplied by the Licensor with the Licensed Material:
 62 | 		i) identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
 63 | 		ii) a copyright notice;
 64 | 		iii) a notice that refers to this Public License;
 65 | 		iv) a notice that refers to the disclaimer of warranties;
 66 | 		v) a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
 67 | 	B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
 68 | 	C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
 69 | 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
 70 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
 71 | 4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.
 72 | 
 73 | ## Section 4 – Sui Generis Database Rights.
 74 | 
 75 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
 76 | 
 77 | 	a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
 78 | 	b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
 79 | 	c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
 80 | 
 81 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
 82 | 
 83 | ## Section 5 – Disclaimer of Warranties and Limitation of Liability.
 84 | 
 85 | 	a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.
 86 | 
 87 | 	b. To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.
 88 | 
 89 | 	c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
 90 | 
 91 | ## Section 6 – Term and Termination.
 92 | 
 93 | a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
 94 | 
 95 | b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
 96 | 
 97 | 	1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
 98 | 	2. upon express reinstatement by the Licensor.
 99 | 
100 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
101 | 
102 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
103 | 
104 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
105 | 
106 | ## Section 7 – Other Terms and Conditions.
107 | 
108 | a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
109 | 
110 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
111 | 
112 | ## Section 8 – Interpretation.
113 | 
114 | a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
115 | 
116 | b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
117 | 
118 | c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
119 | 
120 | d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |     <br>
  3 |     <img src="https://github.com/paperswithcode/galai/raw/main/docs/source/img/logo.png#gh-dark-mode-only" width="400"/>
  4 |     <img src="https://github.com/paperswithcode/galai/raw/main/docs/source/img/logo_black.png#gh-light-mode-only" width="400"/>
  5 |     <br>
  6 | </p>
  7 | <p align="center">
  8 |     <a href="https://github.com/paperswithcode/galai/blob/main/LICENSE">
  9 |         <img alt="GitHub" src="https://img.shields.io/github/license/paperswithcode/galai.svg">
 10 |     </a>
 11 |     <a href="https://github.com/paperswithcode/galai/releases">
 12 |         <img alt="GitHub release" src="https://img.shields.io/github/release/paperswithcode/galai.svg">
 13 |     </a>
 14 | </p>
 15 | 
 16 | **GALACTICA** is a general-purpose scientific language model. It is trained on a large corpus of scientific text and data. It can perform scientific NLP tasks at a high level, as well as tasks such as citation prediction, mathematical reasoning, molecular property prediction and protein annotation. More information is available at [galactica.org](https://galactica.org).
 17 | 
 18 | ## Install
 19 | 
 20 | From pip:
 21 |     
 22 | ```bash
 23 | pip install galai
 24 | ```
 25 | 
 26 | From repository:
 27 |     
 28 | ```bash
 29 | pip install git+https://github.com/paperswithcode/galai
 30 | ```
 31 | 
 32 | ## Models
 33 | 
 34 | There are five GALACTICA models available which we detail below:
 35 | 
 36 | |  Size       | Parameters  |
 37 | |:-----------:|:-----------:|
 38 | | `mini`      |    125 M    |
 39 | | `base`      |    1.3 B    |
 40 | | `standard`  |    6.7 B    |
 41 | | `large`     |     30 B    |
 42 | | `huge`      |    120 B    |
 43 | 
 44 | ## Quickstart
 45 | 
 46 | ```python
 47 | import galai as gal
 48 | 
 49 | model = gal.load_model("standard")
 50 | model.generate("Scaled dot product attention:\n\n\\[")
 51 | # Scaled dot product attention:\n\n\\[ \\displaystyle\\text{Attention}(Q,K,V)=\\text{softmax}(\\frac{QK^{T}}{\\sqrt{d_{k}}}%\n)V \\]
 52 | ```
 53 | 
 54 | Read the full introduction to Galactica models as a [PDF](https://github.com/paperswithcode/galai/blob/main/notebooks/Introduction%20to%20Galactica%20Models.pdf) or a [jupyter notebook](https://github.com/paperswithcode/galai/blob/main/notebooks/Introduction%20to%20Galactica%20Models.ipynb).
 55 | 
 56 | You can also find all the model weights, together with their model cards and inference widgets, on the [Hugging Face Hub](https://huggingface.co/models?other=galactica). All the models can be used out of the box with the `transformers` library.
 57 | 
 58 | ```bash
 59 | pip install transformers accelerate
 60 | ```
 61 | 
 62 | You can run inference using the high-level `pipeline` API:
 63 | 
 64 | ```python
 65 | from transformers import pipeline
 66 | 
 67 | model = pipeline("text-generation", model="facebook/galactica-6.7b")
 68 | input_text = "The Transformer architecture [START_REF]"
 69 | model(input_text)
 70 | ``` 
 71 | 
 72 | Or, for more control, you can use the lower-level `OPTForCausalLM` class. See the model card of the respective repository to learn how to run the model on CPU or GPU and in different precisions.
 73 | 
 74 | ```python
 75 | from transformers import AutoTokenizer, OPTForCausalLM
 76 | 
 77 | tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-6.7b")
 78 | model = OPTForCausalLM.from_pretrained("facebook/galactica-6.7b", device_map="auto")
 79 | 
 80 | input_text = "The Transformer architecture [START_REF]"
 81 | input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
 82 | 
 83 | outputs = model.generate(input_ids)
 84 | print(tokenizer.decode(outputs[0]))
 85 | ```
 86 | 
 87 | ## Capabilities
 88 | 
 89 | GALACTICA is a stand-alone LM which is not instruction tuned. Because of this, you need to use the correct prompts to get good results. In this note, we go over some of the special tokens and prompt styles you will need to use.
 90 | 
 91 | We demonstrate some examples using the standard (6.7B) model below.
 92 | 
 93 | 📚 **Predict Citations**:
 94 | 
 95 | You need to use `[START_REF]`:
 96 | 
 97 | ```python
 98 | model.generate("The Transformer architecture [START_REF]")
 99 | # The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a sequence-to-sequence model that uses self-attention to capture long-range dependencies between input and output tokens. The Transformer has been shown to achieve state-of-the-art results on a wide range of natural
100 | ```
101 | 
102 | 🔢 **Predict LaTeX**:
103 | 
104 | ```python
105 | model.generate("The Schwarzschild radius is defined as: \\[")
106 | # The Schwarzschild radius is defined as: \\[r_{s}=\\frac{2GM}{c^{2}}\\]\n\nwhere \\(G\\) is the gravitational constant, \\(M\\) is the mass of the black hole, and
107 | ```
108 | 
109 | 🤔 **Reasoning**:
110 | 
111 | Reasoning uses the special `<work>` token:
112 | 
113 | ```python
114 | model.generate("A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? <work>")
115 | # What force should be applied to accelerate an object of mass 3kg to 10m/s? <work>\nWe can use Newton's second law: F = ma. We can substitute variables to get:\n\n\\[ F = \\left(66kg
116 | ```
117 | 
118 | ⚛️ **Generate Molecules**:
119 | 
120 | ```python
121 | model.generate("[START_I_SMILES]", max_length=200)
122 | # [START_I_SMILES]CCC1=CC=C(C=C1)C(=O)NC2=CC=CC(=C2)C(=O)NC3=CC=C(C=C3)S(=O)(=O)N[END_I_SMILES]\n\n### Molecular Formula\n\nC22H21N3O4S\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 3-[[3-(4-ethylphenyl)-3-oxo-propanoyl]amino]-N-(4-sulfamoylphenyl)benzamide.\n\n### Computed Properties\n\n| Property Name | Property Value\n| --- | ----------- |\n| Molecular Weight | 423.5\n| XLogP3-AA Log P | 3.2\n| Hydrogen Bond Donor Count | 3\n| Hydrogen Bond Acceptor Count 
123 | ```
124 | 
125 | 🧑‍🔬 **Predict Protein Annotations**:
126 | 
127 | ```python
128 | model.generate("[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords", max_length=200)
129 | # '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords\n\nCytoplasm, Methyltransferase, rRNA processing, S-adenosyl-L-methionine, Transferase\n\n## References\n\nQuestion: What are some articles for Ribosomal RNA small subunit methyltransferase H?\n\nAnswer: \n\n[START_REF] Comparative Genomics of 28 Salmonella enterica Isolates: Evidence for CRISPR-Mediated Adaptive Sublineage Evolution, Fricke[END_REF]\n\n</s>'
130 | ```
131 | 
132 | 🖱️ **Free-Form Generation**
133 | 
134 | If you want autocomplete based functionality, it is often good to experiment with turning off `new_doc=True`. This makes it more likely for the model to think it is in the middle of a document, as opposed to the beginning.
135 | 
136 | ```python
137 | model.generate("The reason why Transformers replaced RNNs was because", new_doc=False)
138 | # The reason why Transformers replaced RNNs was because they were able to capture long-term dependencies in the input sequence.\n\n# 2.2.2. Attention Mechanism\n\nThe attention mechanism was introduced in [START_REF] Neural Machine Translation by Jointly Learning to Align and Translate, Bahdan
139 | ```
140 | 
141 | ❓ **Question Answering**
142 |   
143 | In the paper we prefix questions with "Q:" or "Question:". A typical format is "Question: question.\n\nAnswer:", for example:
144 | 
145 | ```python
146 | model.generate("Question: What is the notch signaling pathway?\n\nAnswer:")
147 | # 'Question: What is the notch signaling pathway?\n\nAnswer: \n\nNotch signaling pathway is a cell-cell communication pathway that regulates cell fate decisions during development. It is involved in cell proliferation, differentiation, apoptosis, and cell migration. The Notch signaling pathway is activated by the binding of'
148 | ```
149 | 
150 | 📄 **Documents**
151 |   
152 | When starting a document, you must use the start document token for good results. To do this, set `new_doc=True` in generate:
153 | 
154 | For some article types, like Wikipedia style articles, lecture notes and GitHub repositories, use `#` to begin, e.g:
155 |   
156 | ```python
157 | model.generate("# Multi-Head Attention\n\n", new_doc=True)
158 | # # Multi-Head Attention\n\nThe multi-head attention mechanism is a generalization of the single-head attention mechanism. The multi-head attention mechanism is a combination of multiple single-head attention mechanisms. The multi-head attention mechanism is shown in Figure 2.\n\nThe multi-
159 | ```
160 |   
161 | For paper documents, use Title, e.g:
162 | 
163 | ```python
164 | model.generate("Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n", new_doc=True)
165 | # Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n# Abstract\n\nSelf-supervised learning is a class of machine learning methods that learn representations of data without the need for human-provided labels.\nIn this survey, we provide a comprehensive overview of the field
166 | ```
167 | 
168 | You can also try alternative sampling techniques for less repetitions, e.g.
169 | 
170 | ```python
171 | model.generate("Lecture 1: The Ising Model\n\n", new_doc=True, top_p=0.7, max_length=200)
172 | # 'Lecture 1: The Ising Model\n\n# 13 Introduction\n\nWe will now look at a simple model for magnetism, the Ising model, which is\na lattice model in which we consider only two spin values, up or down, and\nwe want to understand how these spins interact with each other and how\nthey get arranged in a particular state.\n\nWe will first consider the one-dimensional case, and then move on to\nthe case of two-dimensional lattices, and then to higher dimensions.\n\n# 14 The One-Dimensional Ising Model\n\n# 14.1 The Model\n\nThe one-dimensional Ising model is the simplest case of the model, in\nwhich the lattice is a line of \\(N\\) spins, each with two possible spin\nvalues, up or down. In other words, we consider a line of \\(N\\) spins\nwhere each spin can point up or down'
173 | ```
174 | 
175 | 📜 **Summarization**
176 | 
177 | You can add "TLDR:" for TLDR summaries:
178 | 
179 | ```python
180 | TEXT = """Information overload is a major obstacle to scientific progress. The explosive growth in scientific literature and data has made it ever harder to discover useful insights in a large mass of information. Today scientific knowledge is accessed through search engines, but they are unable to organize scientific knowledge alone. In this paper we introduce Galactica: a large language model that can store, combine and reason about scientific knowledge. We train on a large scientific corpus of papers, reference material, knowledge bases and many other sources. We outperform existing models on a range of scientific tasks. On technical knowledge probes such as LaTeX equations, Galactica outperforms the latest GPT-3 by 68.2% versus 49.0%. Galactica also performs well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20.4% versus 8.8%. It also sets a new state-of-the-art on downstream tasks such as PubMedQA and MedMCQA dev of 77.6% and 52.9%. And despite not being trained on a general corpus, Galactica outperforms BLOOM and OPT-175B on BIG-bench. We believe these results demonstrate the potential for language models as a new interface for science. We open source the model for the benefit of the scientific community."""
181 | 
182 | model.generate(TEXT + "\n\nTLDR:", max_length=400)
183 | # ...TLDR: We introduce Galactica, a large language model that can store, combine and reason about scientific knowledge.</s>
184 | ```
185 | 
186 | 💎 **Entity extraction**
187 | 
188 | You can extract entities from documents. We use the abstract example (`TEXT`) from the previous section, and add questions
189 | 
190 | ```python
191 | ENT_TEXT = TEXT + '\n\nWhat scientific entities are mentioned in the abstract above?\n\n'
192 | 
193 | model.generate(ENT_TEXT, max_length=400)
194 | # ...What scientific entities are mentioned in the abstract above?\n\nA: LaTeX equations, mathematical MMLU, MATH, PubMedQA, MedMCQA, BIG-bench</s>
195 | ```
196 | 
197 | 👨‍🔬 **IUPAC Name prediction**
198 | 
199 | For this task, we used a prompt based on the PubChem document format and prompted the model for the completion. We use the 6.7B model in the example below:
200 | 
201 | ```python
202 | context = "[START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for"
203 | model.generate(context, max_length=400)
204 | # [START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 2-amino-2-oxo-acetic acid
205 | # Note this is an incorrect prediction
206 | ```
207 | 
208 | ## Citation
209 | 
210 | ```bibtex
211 | @inproceedings{GALACTICA,
212 |     title={GALACTICA: A Large Language Model for Science},
213 |     author={Ross Taylor and Marcin Kardas and Guillem Cucurull and Thomas Scialom and Anthony Hartshorn and Elvis Saravia and Andrew Poulton and Viktor Kerkez and Robert Stojnic},
214 |     year={2022}
215 | }
216 | ```
217 | 


--------------------------------------------------------------------------------
/docs/PROMPTBOOK.md:
--------------------------------------------------------------------------------
 1 | # PromptBOOK
 2 | 
 3 | **GALACTICA** is a stand-alone LM which is not instruction tuned. Because of this you need to use the correct prompts to get good results. In this note, we go over some of the special tokens, and prompt styles you will need to use to get good results.
 4 | 
 5 | ## Special Tokens
 6 | 
 7 | ### Citations
 8 | 
 9 | To cite, you need to use `[START_REF]`.
10 | 
11 | ```python
12 | model.generate("The Transformer architecture [START_REF]")
13 | # The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a sequence-to-sequence model that uses self-attention to capture long-range dependencies between input and output tokens. The Transformer has been shown to achieve state-of-the-art results on a wide range of natural
14 | ```
15 | 
16 | ### Reasoning
17 | 
18 | To try step-by-step reasoning, use `<work>`:
19 | 
20 | ```python
21 | model.generate("A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? <work>")
22 | # What force should be applied to accelerate an object of mass 3kg to 10m/s? <work>\nWe can use Newton's second law: F = ma. We can substitute variables to get:\n\n\\[ F = \\left(66kg
23 | ```
24 |   
25 | ### SMILES
26 | 
27 | For standard SMILES use `[START_SMILES]`
28 | 
29 | ```python
30 | model.generate("[START_SMILES]", top_p=0.6, max_length=200)
31 | ```
32 | 
33 | For Isomeric SMILES use `[START_I_SMILES]`:
34 | 
35 | ```python
36 | model.generate("[START_I_SMILES]", top_p=0.6, max_length=200)
37 | # [START_I_SMILES]CCC1=CC=C(C=C1)C(=O)NC2=CC=CC(=C2)C(=O)NC3=CC=C(C=C3)S(=O)(=O)N[END_I_SMILES]\n\n### Molecular Formula\n\nC22H21N3O4S\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for 3-[[3-(4-ethylphenyl)-3-oxo-propanoyl]amino]-N-(4-sulfamoylphenyl)benzamide.\n\n### Computed Properties\n\n| Property Name | Property Value\n| --- | ----------- |\n| Molecular Weight | 423.5\n| XLogP3-AA Log P | 3.2\n| Hydrogen Bond Donor Count | 3\n| Hydrogen Bond Acceptor Count 
38 | ```
39 | 
40 | ### Protein Sequences
41 |   
42 | For protein sequences, use `[START_AMINO]`:
43 | 
44 | ```python
45 | model.generate("[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords", max_length=200)
46 | # '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords\n\nCytoplasm, Methyltransferase, rRNA processing, S-adenosyl-L-methionine, Transferase\n\n## References\n\nQuestion: What are some articles for Ribosomal RNA small subunit methyltransferase H?\n\nAnswer: \n\n[START_REF] Comparative Genomics of 28 Salmonella enterica Isolates: Evidence for CRISPR-Mediated Adaptive Sublineage Evolution, Fricke[END_REF]\n\n</s>'
47 | ```
48 |   
49 | ## Documents
50 |   
51 | When starting a document, you must use the start document token for good results. To do this, set `new_doc=True` in generate:
52 | 
53 | For some article types, like Wikipedia style articles and GitHub repositories, use `#` to begin, e.g:
54 |   
55 | ```python
56 | model.generate("# Multi-Head Attention", new_doc=True)
57 | ```
58 |   
59 | For paper documents, use Title, e.g:
60 | 
61 | ```python
62 | model.generate("Title: Self-Supervised Learning, A Survey", new_doc=True)
63 | ```
64 | 
65 | ## Free-Form Generation
66 | 
67 | If you want autocomplete based functionality, it is often good to experiment with turning off `new_doc=True`. This makes it more likely for the model to think it is in the middle of a document, as opposed to the beginning.
68 | 
69 | ```python
70 | model.generate("The reason why Transformers replaced RNNs was because", new_doc=False)
71 | ```
72 | 
73 | ## Questions
74 |   
75 | In the paper we prefix questions with "Q:" or "Question:". A typical format is "Question: question.\n\nAnswer:", for example:
76 | 
77 | ```python
78 | model.generate("Question: What is the notch signaling pathway?\n\nAnswer:")
79 | ```
80 |  
81 | 


--------------------------------------------------------------------------------
/docs/model_card.md:
--------------------------------------------------------------------------------
 1 | # Model Card: GALACTICA
 2 | 
 3 | Following [Mitchell et al. (2018)](https://arxiv.org/abs/1810.03993), this model card provides information about the GALACTICA model, how it was trained, and the intended use cases. Full details about how the model was trained and evaluated can be found in the [release paper](https://galactica.org/paper.pdf).
 4 | 
 5 | ## Model Details
 6 | 
 7 | The GALACTICA models are trained on a large-scale scientific corpus. The models are designed to perform scientific tasks, including but not limited to citation prediction, scientific QA, mathematical reasoning, summarization, document generation, molecular property prediction and entity extraction. The models were developed by the Papers with Code team at Meta AI to study the use of language models for the automatic organization of science. We train models with sizes ranging from 125M to 120B parameters. Below is a summary of the released models:
 8 | 
 9 | |  Size       | Parameters  |
10 | |:-----------:|:-----------:|
11 | | `mini`      |    125 M    |
12 | | `base`      |    1.3 B    |
13 | | `standard`  |    6.7 B    |
14 | | `large`     |     30 B    |
15 | | `huge`      |    120 B    |
16 | 
17 | 
18 | ## Release Date
19 | 
20 | November 2022
21 | 
22 | ## Model Type
23 | 
24 | Transformer based architecture in a decoder-only setup with a few modifications (see paper for more details). 
25 | 
26 | ## Paper & Demo
27 | 
28 | [[Paper]](https://galactica.org/paper.pdf) / [[Demo]](https://galactica.org)
29 | 
30 | ## Model Use 
31 | 
 32 | The primary intended users of the GALACTICA models are researchers studying language models applied to the scientific domain. We also anticipate the model will be useful for developers who wish to build scientific tooling. However, we caution against production use without safeguards given the potential of language models to hallucinate.
33 | 
34 | The models are made available under a non-commercial CC BY-NC 4.0 license. More information about how to use the model can be found in the README.md of this repository.
35 | 
36 | ## Training Data
37 | 
38 | The GALACTICA models are trained on 106 billion tokens of open-access scientific text and data. This includes papers, textbooks, scientific websites, encyclopedias, reference material, knowledge bases, and more. We tokenize different modalities to provide a natural language interface for different tasks. See the README.md for more information. See the paper for full information on the training data.
39 | 
40 | ## Performance and Limitations
41 | 
42 | The model outperforms several existing language models on a range of knowledge probes, reasoning, and knowledge-intensive scientific tasks. This also extends to general NLP tasks, where GALACTICA outperforms other open source general language models. That being said, we note a number of limitations in this section.
43 | 
 44 | As with other language models, GALACTICA is often prone to hallucination - and training on a high-quality academic corpus does not prevent this, especially for less popular and less cited scientific concepts. There are no guarantees of truthful output when generating from the model. This extends to specific modalities such as citation prediction. While GALACTICA's citation behaviour approaches the ground truth citation behaviour with scale, the model continues to exhibit a popularity bias at larger scales.
45 | 
46 | In addition, we evaluated the model on several types of benchmarks related to stereotypes and toxicity. Overall, the model exhibits substantially lower toxicity rates compared to other large language models. That being said, the model continues to exhibit bias on certain measures (see the paper for details). So we recommend care when using the model for generations.
47 | 
48 | ## Broader Implications
49 | 
 50 | GALACTICA can potentially be used as a new way to discover academic literature. We also expect a lot of downstream use for application to particular domains, such as mathematics, biology and chemistry. In the paper, we demonstrated several examples of the model acting as an alternative to standard search tools. We expect a new generation of scientific tools to be built upon large language models such as GALACTICA.
51 | 
52 | We encourage researchers to investigate beneficial and new use cases for these models. That being said, it is important to be aware of current limitations of large language models. Researchers should pay attention to common issues such as hallucination and biases that could emerge from using these models.
53 | 


--------------------------------------------------------------------------------
/docs/source/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/galai/3a724f562af1a0c8ff97a096c5fbebe579e2160f/docs/source/img/logo.png


--------------------------------------------------------------------------------
/docs/source/img/logo_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/galai/3a724f562af1a0c8ff97a096c5fbebe579e2160f/docs/source/img/logo_black.png


--------------------------------------------------------------------------------
/galai/__init__.py:
--------------------------------------------------------------------------------
  1 | from typing import Union
  2 | 
  3 | from galai.model import Model
  4 | from galai.utils import ModelInfo
  5 | import torch
  6 | import warnings
  7 | from pathlib import Path
  8 | 
# Short model names accepted by load_model, each mapped to a
# (HuggingFace checkpoint id, default torch dtype) pair. load_model uses the
# dtype from this table when the caller does not pass one explicitly.
HF_MAPPING = {
    "mini": ("facebook/galactica-125m", torch.float32),
    "base": ("facebook/galactica-1.3b", torch.float32),
    "standard": ("facebook/galactica-6.7b", torch.float32),
    "large": ("facebook/galactica-30b", torch.float32),
    "huge": ("facebook/galactica-120b", torch.float16)
}
 16 | 
 17 | 
 18 | def load_model(
 19 |     name: str,
 20 |     dtype: Union[str, torch.dtype] = None,
 21 |     num_gpus: int = None,
 22 |     parallelize: bool = False
 23 | ):
 24 |     """
 25 |     Utility function for loading the model
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     name: str
 30 |         Name of the model
 31 | 
 32 |     dtype: str
 33 |         Optional dtype; default float32 for all models but 'huge'
 34 | 
 35 |     num_gpus : int (optional)
 36 |         Number of GPUs to use for the inference. If None, all available GPUs are used. If 0 (or if
 37 |         None and there are no GPUs) only a CPU is used. If a positive number n, then the first n CUDA
 38 |         devices are used.
 39 | 
 40 |     parallelize : bool; default False
 41 |         Specify if to use model tensor parallelizm. Ignored in CPU or single GPU inference.
 42 | 
 43 |         By the default (when parallelize is False) the multi-GPU inference is run using accelerate's
 44 |         pipeline parallelizm in which each GPU is responsible for evaluating a given subset of
 45 |         model's layers. In this mode evaluations are run sequentially. This mode is well suited for
 46 |         developing in model's internals as it is more robust in terms of recovering from exceptions
 47 |         due to not using additional processes. However, because of the sequential nature of
 48 |         pipeline parallelizm, at any given time only a single GPU is working.
 49 | 
 50 |         If parallelize is True, parallelformers' model tensor parallelizm is used instead.
 51 | 
 52 |     Returns
 53 |     ----------
 54 |     Model - model object
 55 |     """
 56 | 
 57 |     if name in HF_MAPPING:
 58 |         hf_model, default_dtype = HF_MAPPING[name]
 59 |         galai_model = True
 60 |     elif Path(name).exists():
 61 |         hf_model = name
 62 |         default_dtype = torch.float32
 63 |         galai_model = False
 64 |     else:
 65 |         raise ValueError(
 66 |             "Invalid model name. Must be one of 'mini', 'base', 'standard', 'large', 'huge', " +
 67 |             "a path to a local checkpoint dir, or a model name available on HuggingFace hub."
 68 |         )
 69 | 
 70 |     if dtype is None:
 71 |         dtype = default_dtype
 72 | 
 73 |     if isinstance(dtype, str):
 74 |         dtype = getattr(torch, dtype, None)
 75 |     if dtype not in (torch.float16, torch.float32, torch.bfloat16):
 76 |         raise ValueError(
 77 |             f"Unsupported dtype: {dtype}"
 78 |         )
 79 | 
 80 |     if dtype == torch.bfloat16 and parallelize:
 81 |         raise ValueError(
 82 |             "Model tensor parallel does not support bfloat16 dtype. Use either dtype='float16' " +
 83 |             "or dtype='float32', or disable tenros parallelizm with parallelize=False."
 84 |         )
 85 | 
 86 |     if num_gpus is None:
 87 |         if torch.cuda.is_available():
 88 |             num_gpus = torch.cuda.device_count()
 89 |         else:
 90 |             num_gpus = 0
 91 |     elif num_gpus > 0:
 92 |         # make sure CUDA is available
 93 |         if not torch.cuda.is_available():
 94 |             warnings.warn(
 95 |                 "No CUDA support detected, falling back to CPU inference. If you want to run " +
 96 |                 "inference on GPU make sure CUDA is configured correctly and pytorch is " +
 97 |                 "installed with CUDA support. Set num_gpus=None to avoid this warning.",
 98 |                 UserWarning
 99 |             )
100 |             num_gpus = 0
101 |         elif num_gpus > torch.cuda.device_count():
102 |             available = torch.cuda.device_count()
103 |             warnings.warn(
104 |                 f"num_gpus={num_gpus} is higher than the number of available CUDA devices. " +
105 |                 f"Setting it to {available}.",
106 |                 UserWarning
107 |             )
108 |             num_gpus = available
109 |     if num_gpus > 1 and parallelize and galai_model:
110 |         mi = ModelInfo.by_name(name)
111 |         if mi.num_heads % num_gpus != 0:
112 |             raise ValueError(
113 |                 f"With parallelize=True the number of model heads ({mi.num_heads} for '{name}' " +
114 |                 "model) must be divisible by the num_gpus. Adapt the number of GPUs, try a " +
115 |                 "different model or set parallelize=False"
116 |             )
117 |     if num_gpus <= 1 and parallelize:
118 |         warnings.warn(
119 |             "parallelize=True requires at least two GPUs. Setting it back to False.",
120 |             UserWarning
121 |         )
122 |         parallelize = False
123 | 
124 |     model = Model(
125 |         name=name,
126 |         dtype=dtype,
127 |         num_gpus=num_gpus,
128 |         tensor_parallel=parallelize,
129 |     )
130 |     model._set_tokenizer(hf_model)
131 |     model._load_checkpoint(checkpoint_path=hf_model)
132 | 
133 |     return model
134 | 


--------------------------------------------------------------------------------
/galai/model.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | from typing import Union, List
  3 | 
  4 | import torch
  5 | 
  6 | from transformers import AutoTokenizer, OPTForCausalLM, StoppingCriteriaList, StoppingCriteria
  7 | from parallelformers import parallelize
  8 | import psutil
  9 | 
 10 | from galai.utils import escape_custom_split_sequence
 11 | 
 12 | 
 13 | __all__ = ["Model"]
 14 | 
 15 | 
 16 | class FinishedReferenceCriteria(StoppingCriteria):
 17 |     """
 18 |     A custom criteria to stop generation as soon as all the sequences in the batch have at least
 19 |     one [END_REF] marker after the prompt.
 20 |     """
 21 |     def __init__(self, prompt_length: int, end_ref_id: int):
 22 |         """
 23 |         Create a new criteria instance for a given generation run.
 24 | 
 25 |         Parameters
 26 |         ----------
 27 |         prompt_length : int
 28 |             The length of the prompt in tokens used to distinguish [END_REF] tokens in the prompt
 29 |             from the generated [END_REF] tokens. For a batch of multiple prompts of different
 30 |             lengths this should be the length of the longest prompt and other prompts should be
 31 |             padded.
 32 |         end_ref_id : int
 33 |             The [END_REF] token id.
 34 |         """
 35 |         self.prompt_length = prompt_length
 36 |         self.end_ref_id = end_ref_id
 37 | 
 38 |     def __call__(self, input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs) -> bool:
 39 |         is_end_ref = (input_ids[:, self.prompt_length:] == self.end_ref_id)
 40 |         has_end_ref = is_end_ref.any(dim=-1)
 41 |         return has_end_ref.all()
 42 | 
 43 | 
 44 | class Model(object):
 45 |     """
 46 |     Model class holding the GALACTICA models. We configure a class to encapsulate the HuggingFace model,
 47 |     the tokenizer, and the specific tokenization logic for GALACTICA. For low-level access, we recommend
 48 |     using the standard HuggingFace API.
 49 |     """
 50 | 
 51 |     def __init__(
 52 |         self,
 53 |         name: str,
 54 |         dtype: str,
 55 |         num_gpus: int,
 56 |         tensor_parallel: bool = False,
 57 |     ):
 58 |         """
 59 |         Initializes a new model
 60 | 
 61 |         Parameters
 62 |         ----------
 63 |         name : str
 64 |             Model name, e.g. `standard`.
 65 | 
 66 |         dtype: torch.dtype
 67 |             Model weights type.
 68 | 
 69 |         num_gpus : int
 70 |             Number of GPUs to use for the inference. If 0 only a CPU is used. If a positive number
 71 |             n, then the first n CUDA devices are used.
 72 | 
 73 |         tensor_parallel : bool
 74 |             Specify if to use model tensor parallelizm. Ignored in CPU or single GPU inference.
 75 |         """
 76 | 
 77 |         self.name = name
 78 |         self.dtype = dtype
 79 |         self.is_loaded = False
 80 |         self.num_gpus = num_gpus
 81 |         self.tensor_parallel = tensor_parallel
 82 |         self.max_input_length = 2020
 83 |         self._master_port = None
 84 | 
 85 |     def _load_checkpoint(self, checkpoint_path: str):
 86 |         """
 87 |         Loads the checkpoint for the model
 88 | 
 89 |         Parameters
 90 |         ----------
 91 |         checkpoint_path : str
 92 |             Path for the checkpoint (str)
 93 |         """
 94 | 
 95 |         # query available memory size of the GPUs we want to use. If tensor_parallel is True,
 96 |         # we just load the model's weights to RAM, as it needs to be sliced by parallelformers
 97 |         # before loading to VRAM.
 98 |         device_map = None
 99 |         max_memory = {}
100 |         if self.num_gpus > 0 and not self.tensor_parallel:
101 |             # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
102 |             for i in range(self.num_gpus):
103 |                 _ = torch.tensor([0], device=i)
104 |             for i in range(self.num_gpus):
105 |                 max_memory[i] = torch.cuda.mem_get_info(i)[0]
106 |             device_map = "auto"
107 |         max_memory["cpu"] = psutil.virtual_memory().available
108 | 
109 |         self.model = OPTForCausalLM.from_pretrained(
110 |             checkpoint_path,
111 |             torch_dtype=self.dtype,
112 |             low_cpu_mem_usage=True,
113 |             device_map=device_map,
114 |             max_memory=max_memory,
115 |         )
116 |         self.model.eval()
117 | 
118 |         if self.tensor_parallel:
119 |             self._parallelize()
120 | 
121 |     def _parallelize(self) -> None:
122 |         """
123 |         Parallelize the model for a tensor-parallel multi-GPU inference.
124 |         """
125 | 
126 |         if self.num_gpus < 2:
127 |             warnings.warn("At least two GPUs are required to parallelize the model.", UserWarning)
128 |             return
129 | 
130 |         self._master_port = 13000 + (id(self.model) % 32749)
131 | 
132 |         custom_policies = None
133 |         if self.model.config.model_type == "opt" and not self.model.config.enable_bias:
134 |             from galai.parallel_policy import OPTDecoderLayerPolicyNoBias
135 |             custom_policies = [OPTDecoderLayerPolicyNoBias]
136 | 
137 |         parallelize(
138 |             self.model, num_gpus=self.num_gpus, fp16=self.dtype == torch.float16,
139 |             master_port=self._master_port,
140 |             custom_policies=custom_policies,
141 |         )
142 | 
143 |     def _set_tokenizer(self, tokenizer_path: str):
144 |         """
145 |         Configures the tokenizer for the model
146 | 
147 |         Parameters
148 |         ----------
149 |         tokenizer_path : str
150 |             Path for the tokenizer (str)
151 |         """
152 |         tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
153 | 
154 |         # setup padding
155 |         tokenizer.pad_token_id = 1
156 |         tokenizer.pad_token = "<pad>"
157 |         tokenizer.padding_side = "left"
158 | 
159 |         # setup truncation
160 |         tokenizer.truncation_side = "left"
161 | 
162 |         # setup special tokens
163 |         tokenizer.bos_token_id = 0
164 |         tokenizer.bos_token = "<s>"
165 | 
166 |         tokenizer.eos_token_id = 2
167 |         tokenizer.eos_token = "</s>"
168 | 
169 |         tokenizer.unk_token = "<unk>"
170 |         tokenizer.unk_token_id = 3
171 | 
172 |         self.tokenizer = tokenizer
173 | 
174 |     def _tokenize(self, input_text: List[str], new_doc: bool) -> torch.LongTensor:
175 |         """
176 |         Apply custom preprocessing to input texts and tokenize them.
177 | 
178 |         Returns
179 |         -------
180 |             input_text : list[str]
181 |                 Texts to be tokenized
182 |             new_doc : bool
183 |                 If True, prepends the end-of-document (</s>) token to each sequence and fixes
184 |                 padding.
185 |         """
186 |         texts = []
187 |         for text in input_text:
188 |             text = escape_custom_split_sequence(text)
189 |             if not text:
190 |                 warnings.warn(
191 |                     "Found an empty input text. Changing to end-of-document token instead.",
192 |                     UserWarning
193 |                 )
194 |                 text = self.tokenizer.eos_token
195 |             texts.append(text)
196 | 
197 |         if new_doc:
198 |             pad_token = self.tokenizer.pad_token
199 |             texts = [pad_token + t for t in texts]
200 | 
201 |         encoded = self.tokenizer(
202 |             texts,
203 |             padding="longest",
204 |             max_length=self.max_input_length,
205 |             truncation=True
206 |         )
207 |         context_tokens = encoded["input_ids"]
208 |         input_v = torch.LongTensor(context_tokens).to(self.model.device)
209 | 
210 |         if new_doc:
211 |             input_v[input_v[:, 0] == self.tokenizer.pad_token_id, 0] = self.tokenizer.eos_token_id
212 |         return input_v
213 | 
214 |     @torch.inference_mode()
215 |     def generate(
216 |         self,
217 |         input_text: Union[str, List[str]],
218 |         max_length=None,
219 |         max_new_tokens=None,
220 |         new_doc=False,
221 |         top_p=None,
222 |         top_k=None,
223 |         penalty_alpha=None,
224 |         num_beams=1,
225 |         num_return_sequences=1,
226 |         return_full_text=True,
227 |     ) -> Union[str, List[str], List[List[str]]]:
228 |         """
229 |         Generates text using the model
230 | 
231 |         Parameters
232 |         ----------
233 |         input_text : str or list[str]
234 |             Input context for the model to use for its generation,
235 |             e.g. "Attention Is All You Need [START_REF]"
236 | 
237 |         max_length : int (optional)
238 |             Maximum length in tokens of the generated text (including prompt). Only one of
239 |             max_length and max_new_tokens should be specified. If neither is set, then
240 |             max_new_tokens is set to 60.
241 | 
242 |         max_new_tokens : int (optional)
243 |             Maximum length in tokens of the generated text (excluding prompt). Only one of
244 |             max_length and max_new_tokens should be specified. If neither is set, then
245 |             max_new_tokens is set to 60.
246 | 
247 |         new_doc : bool
248 |             If True, treats generation a new document, otherwise assumes generation could be
249 |             anywhere within document. Use new_doc=True if you are generating documents, e.g.
250 |             # Schwarzschild Radius, # Transformer (machine learning), 
251 |             Title: Transformers, A Survey. For general prompting, turn off. Default is False.
252 | 
253 |         top_p : float or None
254 |             If a number, e.g. 0.7, performs top p sampling. Default is None.
255 | 
256 |         top_k : int or None
257 |             If a number, performs top k sampling (if penalty_alpha is None) or contrastive search
258 |             decoding (if penalty_alpha > 0). Default is None.
259 | 
260 |         penalty_alpha : float or None
261 |             If a positive number and top_k is set, performs contrastive search decoding with top_k
262 |             candidates reranking. Default is None.
263 | 
264 |         num_beams : int, default 1
265 |             Number of beams to use in beam search.
266 | 
267 |         num_return_sequences : int, default 1
268 |             Number of generations to return for each prompt.
269 | 
270 |         Returns
271 |         ----------
272 |         str, list[str] or list[list[str]] - generated texts from the model. If input_text is a
273 |             singe string, then the output is str if num_return_sequences == 1 or a list of
274 |             strings if num_return_sequences > 1. If input_text is an iterable of strings, then the
275 |             output is either a list of strings if num_return_sequences == 1 or a list of lists of
276 |             strings, in which each inner list contains the generations for a given input prompt.
277 |         """
278 |         texts = [input_text] if isinstance(input_text, str) else input_text
279 |         input_v = self._tokenize(texts, new_doc)
280 |         options = {}
281 |         if penalty_alpha is not None:
282 |             options["penalty_alpha"] = penalty_alpha
283 |             options["top_k"] = top_k
284 |         else:
285 |             if top_p is not None:
286 |                 options["do_sample"] = True
287 |                 options["top_p"] = top_p
288 |             if top_k is not None:
289 |                 options["do_sample"] = True
290 |                 options["top_k"] = top_k
291 | 
292 |         if max_new_tokens is None and max_length is None:
293 |             max_new_tokens = 60
294 |         out = self.model.generate(
295 |             input_v,
296 |             max_length=max_length,
297 |             max_new_tokens=max_new_tokens,
298 |             return_dict_in_generate=True,
299 |             output_hidden_states=False,
300 |             num_beams=num_beams,
301 |             num_return_sequences=num_return_sequences,
302 |             **options
303 |         )
304 | 
305 |         out_tokens = out['sequences']
306 |         if not return_full_text:
307 |             out_tokens = out_tokens[:, input_v.shape[1]:]
308 |         # we keep special tokens such as [START_REF] or <work>
309 |         decoded = self.tokenizer.batch_decode(
310 |             out_tokens,
311 |             skip_special_tokens=False,
312 |             clean_up_tokenization_spaces=False,
313 |         )
314 |         # so we manually remove </s> and <pad>
315 |         decoded = [
316 |             text.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
317 |             for text in decoded
318 |         ]
319 | 
320 |         if num_return_sequences == 1:
321 |             return decoded[0] if isinstance(input_text, str) else decoded
322 |         if isinstance(input_text, str):
323 |             return decoded
324 |         else:
325 |             return [
326 |                 decoded[num_return_sequences * i:num_return_sequences * (i+1)]
327 |                 for i in range(len(texts))
328 |             ]
329 | 
330 |     @torch.inference_mode()
331 |     def generate_reference(
332 |         self,
333 |         input_text: Union[str, List[str]],
334 |         max_length=None,
335 |         max_new_tokens=None,
336 |         new_doc=False,
337 |         top_p=None,
338 |         suggestions=1,
339 |         diversity_penalty=0.0,
340 |     ) -> Union[str, List[str], List[List[str]]]:
341 |         """
342 |         Generates reference.
343 | 
344 |         Parameters
345 |         ----------
346 |         input_text : str or list[str]
347 |             Input context for the model to use for its generation,
348 |             e.g. "Attention Is All You Need [START_REF]"
349 | 
350 |         max_length : int (optional)
351 |             Maximum length in tokens of the generated text (including prompt). Only one of
352 |             max_length and max_new_tokens should be specified.
353 | 
354 |         max_new_tokens : int (optional)
355 |             Maximum length in tokens of the generated text (excluding prompt). Only one of
356 |             max_length and max_new_tokens should be specified. If neither is set, then
357 |             max_new_tokens is set to 60.
358 | 
359 |         new_doc : bool
360 |             If True, treats generation a new document, otherwise assumes generation could be
361 |             anywhere within document. Use new_doc=True if you are generating documents, e.g.
362 |             # Schwarzschild Radius, # Transformer (machine learning),
363 |             Title: Transformers, A Survey. For general prompting, turn off. Default is False.
364 | 
365 |         top_p : float or None
366 |             If None, uses greedy decoding. If a number, e.g. 0.7, performs top p sampling.
367 |             Default is None.
368 | 
369 |         suggestions : int, default 1
370 |             Number of suggestions to return for each input prompt. Uses beam search to return more
371 |             suggestions. Ignored when sampling.
372 | 
373 |         diversity_penalty : float, default 0.0, ignored if sampling or suggestions == 1
374 | 
375 |         Returns
376 |         ----------
377 |         str, list[str] or list[list[str]] - generated reference suggestions from the model. If
378 |             input_text is a singe string, then the output is str if suggestions == 1 or a list of
379 |             strings if suggestions > 1. If input_text is an iterable of strings, then the output is
380 |             either a list of strings if suggestions == 1 or a list of lists of strings, in which
381 |             each inner list contains the suggestions for a given input prompt.
382 |         """
383 |         texts = [input_text] if isinstance(input_text, str) else input_text
384 |         # append [START_REF] token if missing
385 |         fixed_texts = []
386 |         for text in texts:
387 |             start_ref_pos = text.rfind("[START_REF]")
388 |             if start_ref_pos == -1:
389 |                 fixed_texts.append(text + "[START_REF]")
390 |             else:
391 |                 end_ref_pos = text.find("[END_REF]", start_ref_pos)
392 |                 if end_ref_pos != -1:
393 |                     # the last [START_REF] is closed with [END_REF], let's add another one
394 |                     fixed_texts.append(text + "[START_REF]")
395 |                 else:
396 |                     # avoid spaces after [START_REF] token for better results
397 |                     fixed_texts.append(text.rstrip())
398 | 
399 |         input_v = self._tokenize(fixed_texts, new_doc)
400 | 
401 |         prompt_length = input_v.shape[1]
402 |         finished_reference_criteria = FinishedReferenceCriteria(
403 |             prompt_length=prompt_length,
404 |             end_ref_id=self.tokenizer.convert_tokens_to_ids("[END_REF]"),
405 |         )
406 | 
407 |         if max_new_tokens is None and max_length is None:
408 |             max_new_tokens = 60
409 | 
410 |         stopping_criteria = StoppingCriteriaList([finished_reference_criteria])
411 |         if top_p is not None:
412 |             out = self.model.generate(
413 |                 input_v,
414 |                 max_length=max_length,
415 |                 max_new_tokens=max_new_tokens,
416 |                 return_dict_in_generate=True,
417 |                 output_hidden_states=False,
418 |                 top_p=top_p,
419 |                 do_sample=True,
420 |                 num_return_sequences=suggestions,
421 |                 stopping_criteria=stopping_criteria,
422 |             )
423 |         else:
424 |             out = self.model.generate(
425 |                 input_v,
426 |                 max_length=max_length,
427 |                 max_new_tokens=max_new_tokens,
428 |                 num_beams=suggestions,
429 |                 num_return_sequences=suggestions,
430 |                 num_beam_groups=suggestions if diversity_penalty > 0.0 else 1,
431 |                 diversity_penalty=diversity_penalty,
432 |                 return_dict_in_generate=True,
433 |                 output_hidden_states=False,
434 |                 stopping_criteria=stopping_criteria,
435 |             )
436 |         # cut-off the prompts
437 |         generated_tokens = out["sequences"][:, prompt_length:]
438 |         decoded = self.tokenizer.batch_decode(
439 |             generated_tokens,
440 |             skip_special_tokens=False,
441 |             clean_up_tokenization_spaces=False,
442 |         )
443 |         references = []
444 |         unfinished_generation = False
445 |         for text in decoded:
446 |             end_ref_pos = text.find("[END_REF]")
447 |             if end_ref_pos == -1:
448 |                 unfinished_generation = True
449 |                 references.append(text.strip())
450 |             else:
451 |                 references.append(text[:end_ref_pos].strip())
452 |         if unfinished_generation:
453 |             warnings.warn(
454 |                 "At least one of the generated references may be incomplete. Consider increasing max_length or max_new_tokens.",
455 |                 UserWarning
456 |             )
457 | 
458 |         if suggestions == 1:
459 |             return references[0] if isinstance(input_text, str) else references
460 |         if isinstance(input_text, str):
461 |             return references
462 |         else:
463 |             return [
464 |                 references[suggestions * i:suggestions * (i+1)]
465 |                 for i in range(len(texts))
466 |             ]
467 | 


--------------------------------------------------------------------------------
/galai/notebook_utils.py:
--------------------------------------------------------------------------------
  1 | from IPython.display import HTML
  2 | import markdown as md
  3 | import bleach
  4 | from bleach.css_sanitizer import CSSSanitizer
  5 | 
  6 | 
  7 | __all__ = ["display_markdown", "display_latex"]
  8 | 
# HTML tags that survive sanitization; used by clean_html when no explicit `tags`
# argument is given.
ALLOWED_TAGS = [
    "a",
    "abbr",
    "acronym",
    "b",
    "blockquote",
    "br",
    "code",
    "div",
    "em",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "i",
    "li",
    "ol",
    "strong",
    "ul",
    "span",
    "table",
    "thead",
    "tbody",
    "tr",
    "td",
    "th",
    "p",
    "pre",
]

# Per-tag attribute allow-list; used by clean_html when no explicit `attributes`
# argument is given.
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
    "div": ["class"],
    "span": ["style", "class"],
    "td": ["align", "valign"],
    "th": ["align", "valign"],
}

# CSS properties allowed by the default CSSSanitizer that clean_html builds when no
# `css_sanitizer` argument is given.
ALLOWED_CSS_PROPERTIES = [
    "width", "margin", "margin-left", "margin-right",
    "margin-bottom", "margin-top", "height", "color", "font-weight"
]
 54 | 
 55 | 
 56 | def clean_html(value, tags=None, attributes=None, css_sanitizer=None):
 57 |     if tags is None:
 58 |         tags = ALLOWED_TAGS
 59 |     if attributes is None:
 60 |         attributes = ALLOWED_ATTRIBUTES
 61 |     if css_sanitizer is None:
 62 |         css_sanitizer = CSSSanitizer(allowed_css_properties=ALLOWED_CSS_PROPERTIES)
 63 |     elif isinstance(css_sanitizer, list):
 64 |         css_sanitizer = CSSSanitizer(allowed_css_properties=css_sanitizer)
 65 | 
 66 |     cleaned = bleach.clean(
 67 |         value,
 68 |         tags=tags,
 69 |         attributes=attributes,
 70 |         css_sanitizer=css_sanitizer,
 71 |     )
 72 | 
 73 |     return cleaned
 74 | 
 75 | 
 76 | def _markdown2html_unsafe(value):
 77 |     """Converts markdown to unsanitized HTML."""
 78 |     out = md.markdown(
 79 |         value,
 80 |         extensions=[
 81 |             "markdown.extensions.tables", "fenced_code", "codehilite"
 82 |         ],
 83 |     )
 84 |     return out
 85 | 
 86 | 
 87 | def markdown2html(value):
 88 |     return clean_html(_markdown2html_unsafe(value))
 89 | 
 90 | 
 91 | def display_markdown(text):
 92 |     # normalize LaTeX tags
 93 |     text = text.replace(r"\(", "$").replace(r"\)", "$").replace(r"\[", "$$").replace(r"\]", "$$")
 94 |     # convert to markdown and sanitize
 95 |     text = markdown2html(text)
 96 |     # use IPython.display.HTML instead of IPython.display.Markdown so that the output is
 97 |     # rendered properly on notebook load without cells reevaluations
 98 |     return HTML(text)
 99 | 
100 | 
def display_latex(text):
    """Wraps LaTeX text (delimiters normalized to $ / $$) in an HTML display object."""
    # normalize LaTeX tags; the pairs are applied one after another in this order
    delimiter_map = ((r"\(", "$"), (r"\)", "$"), (r"\[", "$$"), (r"\]", "$$"))
    for latex_delimiter, dollar_delimiter in delimiter_map:
        text = text.replace(latex_delimiter, dollar_delimiter)
    # the text is going to be parsed as HTML, so sanitize it with no tags, attributes or
    # CSS properties allowed
    sanitized = clean_html(text, tags=[], attributes=[], css_sanitizer=[])
    # IPython.display.HTML is used instead of IPython.display.Latex so that the output is
    # rendered properly on notebook load without cells reevaluations
    return HTML(sanitized)
109 | 


--------------------------------------------------------------------------------
/galai/parallel_policy.py:
--------------------------------------------------------------------------------
 1 | from parallelformers.policies.base import Layer, Policy
 2 | from parallelformers.utils.dist_utils import AllReduceLinear
 3 | 
 4 | from transformers.models.opt.modeling_opt import OPTDecoderLayer
 5 | 
 6 | 
 7 | __all__ = ["OPTDecoderLayerPolicyNoBias"]
 8 | 
 9 | 
class OPTDecoderLayerPolicyNoBias(Policy):
    """Policy for OPTDecoderLayer in which every layer is described by its weight only."""

    @staticmethod
    def replace_arguments(config, world_size):
        # embedding dimension and number of heads are divided between the workers
        embed_dim_per_worker = config.hidden_size // world_size
        heads_per_worker = config.num_attention_heads // world_size
        return {
            "self_attn.embed_dim": embed_dim_per_worker,
            "self_attn.num_heads": heads_per_worker,
        }

    @staticmethod
    def attn_qkv():
        weight_names = (
            "self_attn.q_proj.weight",
            "self_attn.k_proj.weight",
            "self_attn.v_proj.weight",
        )
        return [Layer(weight=weight_name) for weight_name in weight_names]

    @staticmethod
    def attn_out():
        out_projection = Layer(
            weight="self_attn.out_proj.weight",
            replace=AllReduceLinear,
        )
        return [out_projection]

    @staticmethod
    def mlp_in():
        return [Layer(weight="fc1.weight")]

    @staticmethod
    def mlp_out():
        second_linear = Layer(
            weight="fc2.weight",
            replace=AllReduceLinear,
        )
        return [second_linear]

    @staticmethod
    def original_layer_class():
        return OPTDecoderLayer
61 | 


--------------------------------------------------------------------------------
/galai/utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List
  3 | import math
  4 | import html
  5 | 
  6 | from dataclasses import dataclass
  7 | 
  8 | 
  9 | __all__ = [
 10 |     "escape_custom_split_sequence", "ModelInfo",
 11 | ]
 12 | 
 13 | 
# we split individual characters of sequences enclosed in special tokens like
# [START_DNA] ... [END_DNA]. Groups: 1 = start token, 2 = sequence kind, 3 = the enclosed
# sequence (non-greedy), 4 = end token whose kind must equal group 2 (back-reference).
CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")

# token added to implement a custom sequence tokenization. This token is added at
# corpus cleaning step and removed in pretokenization. The digits are added to increase the chance
# that they do not occur in the corpus. The digits are escaped so that the token does not appear
# literally in the source code in case we ever include it in the training data.
# (the f-string placeholders {1} evaluate to the digit "1")
SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"
 22 | 
 23 | 
 24 | def _insert_split_marker(m: re.Match):
 25 |     """
 26 |     Applies split marker based on a regex match of special tokens such as
 27 |     [START_DNA].
 28 | 
 29 |     Parameters
 30 |     ----------
 31 |     n : str
 32 |         Input text to split
 33 | 
 34 |     Returns
 35 |     ----------
 36 |     str - the text with the split token added
 37 |     """
 38 |     start_token, _, sequence, end_token = m.groups()
 39 |     sequence = re.sub(r"(.)", fr"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
 40 |     return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"
 41 | 
 42 | 
 43 | def escape_custom_split_sequence(text):
 44 |     """
 45 |     Applies custom splitting to the text for GALILEO's tokenization
 46 | 
 47 |     Parameters
 48 |     ----------
 49 |     text : str
 50 |         Input text to split
 51 | 
 52 |     Returns
 53 |     ----------
 54 |     str - the text with the split token added
 55 |     """
 56 |     return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
 57 | 
 58 | 
 59 | REFERENCE_RE = re.compile(r"\[START_REF\](.*?)\[END_REF\]", flags=re.DOTALL)
 60 | 
 61 | 
 62 | def extract_references_from_text(text: str) -> List[str]:
 63 |     return [cit.strip() for cit in REFERENCE_RE.findall(text)]
 64 | 
 65 | 
 66 | @dataclass
 67 | class ModelInfo:
 68 |     name: str
 69 |     num_layers: int
 70 |     num_heads: int
 71 |     head_size: int = 128
 72 |     vocab_size: int = 50000
 73 |     max_positions: int = 2048
 74 | 
 75 |     @property
 76 |     def hidden_dimension(self) -> int:
 77 |         return self.head_size * self.num_heads
 78 | 
 79 |     @property
 80 |     def parameters(self) -> int:
 81 |         layer_norm_elementwise_affine = True
 82 |         enable_bias = True
 83 |         h_dim = self.hidden_dimension
 84 |         bias = h_dim if enable_bias else 0
 85 |         embed_tokens_size = self.vocab_size * h_dim
 86 |         embed_positions_size = (self.max_positions + 2) * h_dim
 87 |         layer_norm_size = 2 * h_dim if layer_norm_elementwise_affine else 0
 88 |         self_attn_size = 4 * (h_dim * h_dim + bias)  # 4 = k_proj+v_proj+q_proj+out_proj
 89 |         ffn_dim = 4 * h_dim
 90 |         fc_size = 2 * h_dim * ffn_dim + 5 * bias  # 2 = fc1 + fc2
 91 |         decoder_layer_size = self_attn_size + fc_size + 2 * layer_norm_size
 92 |         decoder_size = self.num_layers * decoder_layer_size + layer_norm_size + embed_tokens_size + embed_positions_size
 93 | 
 94 |         return decoder_size
 95 | 
 96 |     @property
 97 |     def disk_size(self) -> int:
 98 |         """Approximate dist size in bytes of checkpoints files"""
 99 |         return self.parameters * 2
100 | 
101 |     def weights_size(self, dtype="float16") -> int:
102 |         """Approximate total size of model weights in memory"""
103 |         element_size = 2 if dtype == "float16" else 4
104 |         return self.parameters * element_size
105 | 
106 |     def memory_per_token(self, dtype="float16") -> int:
107 |         """Approximate memory size required to store intermediate activations and cached outputs"""
108 |         element_size = 2 if dtype == "float16" else 4
109 |         return 2 * self.num_layers * self.num_heads * self.head_size * element_size
110 | 
111 |     @staticmethod
112 |     def by_name(name: str) -> "ModelInfo":
113 |         return _MODEL_INFO_BY_NAME[name]
114 | 
115 |     @staticmethod
116 |     def all() -> List["ModelInfo"]:
117 |         return _MODEL_INFO
118 | 
119 | 
120 | def _humanize(parameters):
121 |     scale = min(int(math.log10(parameters)) // 3, 4)
122 |     suffix = " KMBT"[scale]
123 | 
124 |     return f"{parameters / math.pow(10, 3 * scale):.1f} {suffix}".rstrip()
125 | 
126 | 
class ModelInfoList(list):
    """List of model descriptions that can render itself as an HTML table."""

    def _repr_html_(self):
        """Returns an HTML table with one row per item, or an empty string for an empty list."""
        if not self:
            return ""
        # column title -> function producing the cell content for a given item
        columns = {
            "Name": lambda m: f"<strong>{html.escape(m.name)}</strong>",
            "Parameters": lambda m: _humanize(m.parameters),
            "Layers": lambda m: str(m.num_layers),
            "Heads": lambda m: str(m.num_heads),
            "Head Size": lambda m: str(m.head_size),
            "Vocabulary Size": lambda m: str(m.vocab_size),
            "Context Size": lambda m: str(m.max_positions),
        }
        header_cells = "".join(f"<th>{title}</th>" for title in columns)
        body_rows = "".join(
            "<tr>" + "".join(f"<td>{render(item)}</td>" for render in columns.values()) + "</tr>"
            for item in self
        )
        return (
            f"<table><thead><tr>{header_cells}</tr></thead>"
            f"<tbody>{body_rows}</tbody></table>"
        )
151 | 
152 | 
# Registry of the known model variants, returned by ModelInfo.all(). vocab_size and
# max_positions are not passed here, so every entry uses the ModelInfo defaults for them.
_MODEL_INFO = ModelInfoList([
    ModelInfo("mini",      num_layers=12, num_heads=12, head_size=64),
    ModelInfo("base",      num_layers=24, num_heads=32, head_size=64),
    ModelInfo("standard",  num_layers=32, num_heads=32, head_size=128),
    ModelInfo("large",     num_layers=48, num_heads=56, head_size=128),
    ModelInfo("huge",      num_layers=96, num_heads=80, head_size=128),
])

# name -> ModelInfo lookup table backing ModelInfo.by_name()
_MODEL_INFO_BY_NAME = {model.name: model for model in _MODEL_INFO}
162 | 


--------------------------------------------------------------------------------
/notebooks/Introduction to Galactica Models.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/galai/3a724f562af1a0c8ff97a096c5fbebe579e2160f/notebooks/Introduction to Galactica Models.pdf


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.12
2 | transformers==4.25.1
3 | tokenizers
4 | parallelformers==1.2.7
5 | accelerate
6 | markdown>=3.4
7 | bleach[css]~=5.0.1
8 | psutil
9 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | PACKAGE_NAME = 'galai'
 4 | VERSION = "1.1.7.dev0"
 5 | DESCRIPTION = "API for the GALACTICA model"
 6 | KEYWORDS = "Scientific Intelligence"
 7 | URL = 'https://github.com/paperswithcode/galai'
 8 | AUTHOR = 'Papers with Code'
 9 | LICENSE = 'Apache License 2.0'
10 | REQUIRES_PYTHON = '>=3.7.0'
11 | EXTRAS = {}
12 | 
13 | with open("README.md", "r", encoding="utf-8") as f:
14 |     long_description = f.read()
15 | 
16 | with open("requirements.txt", "r") as f:
17 |     requirements = [line.strip() for line in f.readlines()]
18 | 
19 | setup(
20 |     name=PACKAGE_NAME,
21 |     version=VERSION,
22 |     description=DESCRIPTION,
23 |     long_description=long_description,
24 |     long_description_content_type='text/markdown',
25 |     keywords=KEYWORDS,
26 |     license=LICENSE,
27 |     author=AUTHOR,
28 |     python_requires=REQUIRES_PYTHON,
29 |     url=URL,
30 |     packages=find_packages(include=f"{PACKAGE_NAME}.*"),
31 |     install_requires=requirements,
32 |     extras_require=EXTRAS,
33 |     include_package_data=True,
34 |     classifiers=[
35 |         "Intended Audience :: Developers",
36 |         "Intended Audience :: Education",
37 |         "Intended Audience :: Science/Research",
38 |         "License :: OSI Approved :: MIT License",
39 |         "Operating System :: OS Independent",
40 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
41 |         'Programming Language :: Python',
42 |         'Programming Language :: Python :: 3',
43 |         'Programming Language :: Python :: 3.7',
44 |         'Programming Language :: Python :: 3.8',
45 |         'Programming Language :: Python :: 3.9',
46 |     ],
47 | )
48 | 


--------------------------------------------------------------------------------