├── .DS_Store ├── LICENCE ├── README.md ├── __init__.py ├── data ├── .DS_Store └── mr │ ├── MR.task.test │ ├── MR.task.test.labels │ ├── MR.task.test.sentences │ ├── MR.task.train │ ├── MR.task.train.labels │ ├── MR.task.train.sentences │ └── MR.task.unlabel ├── evaluate_batch.py ├── main_batch.py ├── nnet ├── .DS_Store ├── __init__.py ├── blstm.py └── lstm.py ├── preprocessing.py └── yutils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/.DS_Store -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. 
A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. 
You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. 
The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | 307 | including for purposes of Section 3(b); and 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. 
This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public 411 | licenses. Notwithstanding, Creative Commons may elect to apply one of 412 | its public licenses to material it publishes and in those instances 413 | will be considered the “Licensor.” The text of the Creative Commons 414 | public licenses is dedicated to the public domain under the CC0 Public 415 | Domain Dedication. 
Except for the limited purpose of indicating that 416 | material is shared under a Creative Commons public license or as 417 | otherwise permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. For 424 | the avoidance of doubt, this paragraph does not form part of the 425 | public licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. 428 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### The code is created by JianhuaYuan 3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/__init__.py -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/.DS_Store -------------------------------------------------------------------------------- /data/mr/MR.task.test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.test -------------------------------------------------------------------------------- /data/mr/MR.task.test.labels: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 1 5 | 0 6 | 1 7 | 1 8 | 0 9 | 1 10 | 1 11 | 0 12 | 1 13 | 0 14 | 0 15 | 1 16 | 0 17 | 0 18 | 0 19 | 1 20 | 0 21 | 1 22 | 0 23 | 1 24 | 0 25 | 1 26 | 1 27 | 0 28 | 1 29 | 0 30 | 1 31 | 1 32 | 0 33 | 0 34 | 1 35 | 0 36 | 1 37 | 1 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 0 45 | 1 46 | 1 47 | 0 48 | 0 49 | 0 50 | 1 51 | 1 52 | 1 53 | 1 54 | 0 55 | 0 56 | 1 57 | 1 58 | 0 59 | 1 60 | 0 61 | 1 62 | 0 63 | 1 64 | 0 65 | 0 66 | 1 67 | 1 68 | 1 69 | 1 70 | 1 71 | 1 72 | 0 73 | 0 74 | 1 75 | 0 76 | 0 77 | 0 78 | 1 79 | 0 80 | 1 81 | 1 82 | 1 83 | 1 84 | 0 85 | 1 86 | 1 87 | 0 88 | 1 89 | 0 90 | 0 91 | 1 92 | 1 93 | 1 94 | 0 95 | 1 96 | 1 97 | 1 98 | 0 99 | 0 100 | 1 101 | 1 102 | 0 103 | 1 104 | 1 105 | 1 106 | 1 107 | 0 108 | 1 109 | 1 110 | 1 111 | 1 112 | 0 113 | 1 114 | 1 115 | 1 116 | 0 117 | 1 118 | 1 119 | 0 120 | 0 121 | 1 122 | 0 123 | 1 124 | 0 125 | 0 126 | 1 127 | 1 128 | 1 129 | 0 130 | 0 131 | 0 132 | 1 133 | 0 134 | 1 135 | 1 136 | 1 137 | 0 138 | 0 139 | 1 140 | 0 141 | 0 142 | 0 143 | 1 144 | 1 145 | 1 146 | 0 147 | 1 148 | 0 149 | 0 150 | 1 151 | 1 152 | 1 153 | 0 154 | 0 155 | 0 156 | 0 157 | 0 158 | 0 159 | 0 160 | 0 161 | 0 162 | 1 163 | 1 164 | 1 165 | 1 166 | 0 167 | 1 168 | 1 169 | 0 170 | 0 171 | 1 172 | 0 173 | 1 174 | 0 175 | 0 176 | 1 177 | 1 178 | 0 179 | 0 180 | 1 181 | 0 182 | 1 183 | 0 184 | 1 185 | 0 186 | 1 187 | 1 188 | 1 189 | 1 190 | 1 191 | 1 192 | 0 193 | 0 194 | 0 195 | 1 196 | 1 197 | 1 198 | 
0 199 | 1 200 | 1 201 | 0 202 | 0 203 | 0 204 | 0 205 | 1 206 | 1 207 | 0 208 | 0 209 | 1 210 | 1 211 | 1 212 | 0 213 | 0 214 | 1 215 | 1 216 | 0 217 | 1 218 | 1 219 | 1 220 | 1 221 | 0 222 | 0 223 | 0 224 | 1 225 | 1 226 | 0 227 | 1 228 | 0 229 | 0 230 | 1 231 | 0 232 | 0 233 | 1 234 | 1 235 | 0 236 | 0 237 | 1 238 | 1 239 | 0 240 | 1 241 | 0 242 | 0 243 | 1 244 | 1 245 | 0 246 | 1 247 | 1 248 | 0 249 | 0 250 | 0 251 | 0 252 | 1 253 | 1 254 | 0 255 | 1 256 | 1 257 | 0 258 | 0 259 | 0 260 | 1 261 | 1 262 | 1 263 | 0 264 | 1 265 | 0 266 | 1 267 | 0 268 | 0 269 | 1 270 | 1 271 | 0 272 | 1 273 | 0 274 | 1 275 | 1 276 | 0 277 | 0 278 | 0 279 | 1 280 | 0 281 | 1 282 | 0 283 | 1 284 | 1 285 | 0 286 | 1 287 | 0 288 | 1 289 | 0 290 | 0 291 | 0 292 | 0 293 | 0 294 | 1 295 | 1 296 | 1 297 | 0 298 | 0 299 | 1 300 | 1 301 | 1 302 | 0 303 | 1 304 | 1 305 | 0 306 | 0 307 | 0 308 | 1 309 | 0 310 | 1 311 | 0 312 | 0 313 | 1 314 | 1 315 | 1 316 | 0 317 | 1 318 | 0 319 | 1 320 | 1 321 | 1 322 | 1 323 | 0 324 | 0 325 | 0 326 | 0 327 | 0 328 | 1 329 | 0 330 | 1 331 | 0 332 | 1 333 | 0 334 | 0 335 | 1 336 | 1 337 | 1 338 | 0 339 | 0 340 | 1 341 | 0 342 | 0 343 | 1 344 | 1 345 | 1 346 | 1 347 | 1 348 | 1 349 | 1 350 | 0 351 | 0 352 | 0 353 | 0 354 | 0 355 | 1 356 | 1 357 | 1 358 | 1 359 | 1 360 | 1 361 | 1 362 | 1 363 | 0 364 | 1 365 | 1 366 | 1 367 | 0 368 | 0 369 | 0 370 | 0 371 | 1 372 | 0 373 | 0 374 | 1 375 | 0 376 | 0 377 | 1 378 | 0 379 | 0 380 | 0 381 | 1 382 | 0 383 | 1 384 | 0 385 | 0 386 | 1 387 | 0 388 | 0 389 | 1 390 | 1 391 | 1 392 | 1 393 | 0 394 | 0 395 | 1 396 | 1 397 | 0 398 | 1 399 | 1 400 | 1 401 | -------------------------------------------------------------------------------- /data/mr/MR.task.test.sentences: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.test.sentences -------------------------------------------------------------------------------- /data/mr/MR.task.train: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.train -------------------------------------------------------------------------------- /data/mr/MR.task.train.labels: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 0 7 | 0 8 | 0 9 | 1 10 | 1 11 | 1 12 | 0 13 | 1 14 | 0 15 | 0 16 | 0 17 | 1 18 | 0 19 | 0 20 | 0 21 | 1 22 | 1 23 | 1 24 | 0 25 | 1 26 | 1 27 | 0 28 | 0 29 | 1 30 | 0 31 | 0 32 | 1 33 | 0 34 | 0 35 | 1 36 | 0 37 | 1 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 1 44 | 1 45 | 0 46 | 1 47 | 1 48 | 0 49 | 0 50 | 1 51 | 0 52 | 0 53 | 0 54 | 1 55 | 1 56 | 0 57 | 1 58 | 1 59 | 0 60 | 1 61 | 1 62 | 1 63 | 1 64 | 1 65 | 1 66 | 0 67 | 1 68 | 0 69 | 0 70 | 0 71 | 0 72 | 1 73 | 0 74 | 1 75 | 1 76 | 0 77 | 1 78 | 1 79 | 1 80 | 1 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 1 87 | 1 88 | 0 89 | 0 90 | 0 91 | 1 92 | 1 93 | 0 94 | 1 95 | 1 96 | 1 97 | 0 98 | 0 99 | 1 100 | 0 101 | 0 102 | 0 103 | 1 104 | 0 105 | 0 106 | 0 107 | 0 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 0 114 | 0 115 | 1 116 | 0 117 | 1 118 | 0 119 | 0 120 | 0 121 | 0 122 | 0 123 | 0 124 | 1 125 | 1 126 | 0 127 | 0 128 | 0 129 | 1 130 | 1 131 | 0 132 | 1 133 | 1 134 | 0 135 | 1 136 | 1 137 | 0 138 | 0 139 | 1 140 | 0 141 | 1 142 | 0 143 | 1 144 | 0 145 | 0 146 | 0 147 | 1 148 | 0 149 | 1 150 | 0 151 | 1 152 | 1 153 | 0 154 | 
0 155 | 1 156 | 1 157 | 1 158 | 1 159 | 0 160 | 0 161 | 0 162 | 1 163 | 0 164 | 0 165 | 1 166 | 1 167 | 0 168 | 0 169 | 0 170 | 0 171 | 0 172 | 1 173 | 0 174 | 0 175 | 1 176 | 0 177 | 0 178 | 0 179 | 0 180 | 0 181 | 0 182 | 0 183 | 0 184 | 1 185 | 1 186 | 0 187 | 0 188 | 1 189 | 0 190 | 1 191 | 0 192 | 0 193 | 0 194 | 0 195 | 0 196 | 0 197 | 0 198 | 1 199 | 1 200 | 0 201 | 1 202 | 0 203 | 0 204 | 1 205 | 1 206 | 0 207 | 0 208 | 1 209 | 0 210 | 0 211 | 0 212 | 1 213 | 1 214 | 0 215 | 0 216 | 0 217 | 0 218 | 1 219 | 1 220 | 0 221 | 0 222 | 1 223 | 0 224 | 1 225 | 1 226 | 1 227 | 1 228 | 0 229 | 0 230 | 0 231 | 0 232 | 0 233 | 1 234 | 0 235 | 1 236 | 1 237 | 0 238 | 0 239 | 0 240 | 1 241 | 1 242 | 1 243 | 0 244 | 0 245 | 1 246 | 0 247 | 0 248 | 0 249 | 0 250 | 1 251 | 1 252 | 1 253 | 1 254 | 0 255 | 0 256 | 1 257 | 0 258 | 0 259 | 1 260 | 1 261 | 0 262 | 0 263 | 1 264 | 0 265 | 0 266 | 0 267 | 0 268 | 0 269 | 1 270 | 1 271 | 1 272 | 1 273 | 0 274 | 0 275 | 1 276 | 0 277 | 0 278 | 1 279 | 0 280 | 1 281 | 1 282 | 0 283 | 1 284 | 1 285 | 0 286 | 0 287 | 0 288 | 1 289 | 0 290 | 0 291 | 1 292 | 0 293 | 0 294 | 1 295 | 1 296 | 0 297 | 1 298 | 0 299 | 1 300 | 0 301 | 0 302 | 1 303 | 0 304 | 0 305 | 0 306 | 0 307 | 0 308 | 1 309 | 1 310 | 1 311 | 0 312 | 1 313 | 0 314 | 1 315 | 1 316 | 0 317 | 1 318 | 1 319 | 0 320 | 0 321 | 0 322 | 1 323 | 0 324 | 0 325 | 1 326 | 1 327 | 0 328 | 1 329 | 0 330 | 0 331 | 0 332 | 1 333 | 0 334 | 1 335 | 0 336 | 0 337 | 1 338 | 1 339 | 0 340 | 1 341 | 1 342 | 0 343 | 1 344 | 1 345 | 1 346 | 1 347 | 1 348 | 1 349 | 1 350 | 1 351 | 1 352 | 0 353 | 0 354 | 0 355 | 0 356 | 1 357 | 1 358 | 0 359 | 0 360 | 1 361 | 0 362 | 1 363 | 0 364 | 0 365 | 0 366 | 1 367 | 0 368 | 0 369 | 1 370 | 1 371 | 0 372 | 0 373 | 0 374 | 1 375 | 0 376 | 0 377 | 0 378 | 1 379 | 0 380 | 0 381 | 0 382 | 0 383 | 1 384 | 0 385 | 0 386 | 1 387 | 0 388 | 0 389 | 0 390 | 0 391 | 0 392 | 0 393 | 1 394 | 1 395 | 0 396 | 0 397 | 0 398 | 0 399 | 1 400 | 0 401 | 0 402 | 0 403 | 1 404 | 0 405 | 0 406 | 1 407 | 0 408 | 0 409 | 0 410 | 1 411 | 1 412 | 0 413 | 1 414 | 1 415 | 0 416 | 0 417 | 0 418 | 1 419 | 1 420 | 0 421 | 0 422 | 1 423 | 0 424 | 0 425 | 1 426 | 1 427 | 0 428 | 1 429 | 1 430 | 1 431 | 1 432 | 1 433 | 1 434 | 0 435 | 1 436 | 0 437 | 1 438 | 1 439 | 1 440 | 0 441 | 0 442 | 0 443 | 0 444 | 1 445 | 0 446 | 0 447 | 1 448 | 0 449 | 1 450 | 1 451 | 0 452 | 1 453 | 1 454 | 0 455 | 0 456 | 0 457 | 1 458 | 0 459 | 1 460 | 1 461 | 1 462 | 1 463 | 0 464 | 1 465 | 1 466 | 0 467 | 0 468 | 1 469 | 1 470 | 1 471 | 1 472 | 1 473 | 1 474 | 0 475 | 0 476 | 0 477 | 1 478 | 0 479 | 1 480 | 0 481 | 0 482 | 1 483 | 1 484 | 1 485 | 1 486 | 0 487 | 1 488 | 1 489 | 1 490 | 1 491 | 1 492 | 1 493 | 0 494 | 0 495 | 1 496 | 1 497 | 1 498 | 0 499 | 0 500 | 0 501 | 0 502 | 0 503 | 1 504 | 0 505 | 1 506 | 0 507 | 1 508 | 1 509 | 0 510 | 1 511 | 1 512 | 1 513 | 0 514 | 0 515 | 1 516 | 0 517 | 1 518 | 0 519 | 0 520 | 0 521 | 0 522 | 0 523 | 1 524 | 1 525 | 1 526 | 1 527 | 1 528 | 1 529 | 1 530 | 1 531 | 1 532 | 0 533 | 1 534 | 1 535 | 1 536 | 1 537 | 1 538 | 1 539 | 0 540 | 1 541 | 1 542 | 1 543 | 0 544 | 1 545 | 0 546 | 0 547 | 0 548 | 1 549 | 1 550 | 1 551 | 1 552 | 0 553 | 1 554 | 1 555 | 0 556 | 0 557 | 1 558 | 0 559 | 0 560 | 0 561 | 1 562 | 0 563 | 0 564 | 0 565 | 0 566 | 0 567 | 0 568 | 0 569 | 0 570 | 1 571 | 0 572 | 0 573 | 0 574 | 1 575 | 1 576 | 1 577 | 0 578 | 0 579 | 1 580 | 0 581 | 0 582 | 0 583 | 1 584 | 1 585 | 0 586 | 0 587 | 1 588 | 0 589 | 1 590 | 0 591 | 1 592 | 1 593 | 1 594 | 0 595 | 1 596 | 1 597 | 1 598 | 1 
599 | 1 600 | 1 601 | 0 602 | 0 603 | 1 604 | 1 605 | 1 606 | 1 607 | 1 608 | 1 609 | 1 610 | 0 611 | 1 612 | 1 613 | 1 614 | 0 615 | 1 616 | 1 617 | 0 618 | 0 619 | 1 620 | 1 621 | 0 622 | 0 623 | 1 624 | 1 625 | 1 626 | 1 627 | 1 628 | 1 629 | 1 630 | 0 631 | 1 632 | 1 633 | 0 634 | 1 635 | 0 636 | 1 637 | 1 638 | 1 639 | 1 640 | 0 641 | 1 642 | 0 643 | 1 644 | 1 645 | 0 646 | 1 647 | 1 648 | 0 649 | 1 650 | 0 651 | 0 652 | 1 653 | 1 654 | 1 655 | 0 656 | 1 657 | 0 658 | 1 659 | 1 660 | 0 661 | 0 662 | 0 663 | 1 664 | 1 665 | 1 666 | 0 667 | 1 668 | 0 669 | 0 670 | 1 671 | 1 672 | 0 673 | 0 674 | 1 675 | 1 676 | 0 677 | 1 678 | 0 679 | 1 680 | 1 681 | 1 682 | 1 683 | 1 684 | 0 685 | 1 686 | 1 687 | 1 688 | 0 689 | 0 690 | 0 691 | 0 692 | 1 693 | 0 694 | 1 695 | 1 696 | 0 697 | 1 698 | 1 699 | 0 700 | 1 701 | 1 702 | 1 703 | 0 704 | 0 705 | 1 706 | 1 707 | 1 708 | 1 709 | 0 710 | 0 711 | 1 712 | 1 713 | 1 714 | 0 715 | 1 716 | 0 717 | 1 718 | 1 719 | 0 720 | 0 721 | 1 722 | 0 723 | 1 724 | 1 725 | 0 726 | 0 727 | 1 728 | 0 729 | 1 730 | 0 731 | 1 732 | 0 733 | 1 734 | 1 735 | 1 736 | 1 737 | 0 738 | 0 739 | 0 740 | 1 741 | 0 742 | 0 743 | 1 744 | 1 745 | 1 746 | 0 747 | 1 748 | 1 749 | 1 750 | 1 751 | 0 752 | 0 753 | 1 754 | 0 755 | 0 756 | 1 757 | 1 758 | 1 759 | 1 760 | 1 761 | 0 762 | 1 763 | 1 764 | 0 765 | 1 766 | 0 767 | 1 768 | 0 769 | 1 770 | 0 771 | 1 772 | 1 773 | 0 774 | 0 775 | 0 776 | 0 777 | 1 778 | 0 779 | 1 780 | 0 781 | 1 782 | 1 783 | 0 784 | 0 785 | 1 786 | 1 787 | 0 788 | 1 789 | 0 790 | 1 791 | 0 792 | 1 793 | 0 794 | 0 795 | 0 796 | 1 797 | 1 798 | 1 799 | 0 800 | 0 801 | 1 802 | 1 803 | 1 804 | 0 805 | 0 806 | 0 807 | 1 808 | 0 809 | 1 810 | 1 811 | 1 812 | 0 813 | 1 814 | 0 815 | 1 816 | 1 817 | 0 818 | 0 819 | 1 820 | 0 821 | 0 822 | 0 823 | 1 824 | 1 825 | 0 826 | 1 827 | 0 828 | 1 829 | 0 830 | 0 831 | 0 832 | 1 833 | 1 834 | 1 835 | 0 836 | 1 837 | 0 838 | 0 839 | 1 840 | 1 841 | 1 842 | 1 843 | 0 844 | 0 845 | 1 846 | 1 847 | 0 848 | 0 849 | 0 850 | 1 851 | 1 852 | 1 853 | 0 854 | 1 855 | 0 856 | 0 857 | 1 858 | 0 859 | 0 860 | 1 861 | 1 862 | 0 863 | 1 864 | 1 865 | 0 866 | 0 867 | 0 868 | 1 869 | 0 870 | 0 871 | 0 872 | 0 873 | 0 874 | 1 875 | 0 876 | 1 877 | 1 878 | 0 879 | 1 880 | 0 881 | 1 882 | 1 883 | 0 884 | 1 885 | 1 886 | 1 887 | 1 888 | 1 889 | 0 890 | 0 891 | 1 892 | 1 893 | 1 894 | 0 895 | 1 896 | 0 897 | 1 898 | 0 899 | 0 900 | 1 901 | 0 902 | 0 903 | 1 904 | 1 905 | 1 906 | 1 907 | 1 908 | 0 909 | 0 910 | 1 911 | 0 912 | 1 913 | 1 914 | 0 915 | 1 916 | 0 917 | 0 918 | 1 919 | 1 920 | 1 921 | 1 922 | 0 923 | 0 924 | 0 925 | 1 926 | 0 927 | 0 928 | 1 929 | 1 930 | 0 931 | 1 932 | 0 933 | 1 934 | 0 935 | 1 936 | 0 937 | 0 938 | 1 939 | 1 940 | 1 941 | 0 942 | 0 943 | 1 944 | 0 945 | 0 946 | 1 947 | 0 948 | 1 949 | 1 950 | 0 951 | 1 952 | 0 953 | 0 954 | 1 955 | 0 956 | 0 957 | 1 958 | 1 959 | 0 960 | 1 961 | 0 962 | 1 963 | 1 964 | 1 965 | 1 966 | 1 967 | 0 968 | 1 969 | 0 970 | 0 971 | 0 972 | 1 973 | 0 974 | 1 975 | 1 976 | 1 977 | 1 978 | 1 979 | 1 980 | 1 981 | 1 982 | 1 983 | 0 984 | 1 985 | 0 986 | 1 987 | 1 988 | 1 989 | 0 990 | 0 991 | 0 992 | 1 993 | 0 994 | 0 995 | 1 996 | 0 997 | 0 998 | 1 999 | 1 1000 | 0 1001 | 0 1002 | 1 1003 | 1 1004 | 0 1005 | 1 1006 | 1 1007 | 0 1008 | 0 1009 | 0 1010 | 0 1011 | 0 1012 | 0 1013 | 0 1014 | 1 1015 | 0 1016 | 0 1017 | 0 1018 | 0 1019 | 1 1020 | 1 1021 | 1 1022 | 1 1023 | 0 1024 | 1 1025 | 0 1026 | 0 1027 | 0 1028 | 1 1029 | 0 1030 | 0 1031 | 1 1032 | 1 1033 | 1 1034 | 0 1035 | 1 1036 | 0 1037 | 1 1038 
| 0 1039 | 1 1040 | 0 1041 | 0 1042 | 0 1043 | 0 1044 | 0 1045 | 0 1046 | 0 1047 | 0 1048 | 1 1049 | 1 1050 | 0 1051 | 0 1052 | 0 1053 | 1 1054 | 1 1055 | 0 1056 | 0 1057 | 0 1058 | 1 1059 | 1 1060 | 1 1061 | 1 1062 | 0 1063 | 0 1064 | 0 1065 | 0 1066 | 1 1067 | 1 1068 | 1 1069 | 1 1070 | 1 1071 | 0 1072 | 1 1073 | 1 1074 | 1 1075 | 1 1076 | 1 1077 | 1 1078 | 0 1079 | 1 1080 | 1 1081 | 1 1082 | 1 1083 | 1 1084 | 0 1085 | 1 1086 | 0 1087 | 1 1088 | 1 1089 | 0 1090 | 1 1091 | 0 1092 | 1 1093 | 0 1094 | 1 1095 | 1 1096 | 1 1097 | 1 1098 | 1 1099 | 0 1100 | 0 1101 | 0 1102 | 0 1103 | 0 1104 | 0 1105 | 0 1106 | 0 1107 | 0 1108 | 1 1109 | 1 1110 | 1 1111 | 0 1112 | 0 1113 | 1 1114 | 1 1115 | 0 1116 | 0 1117 | 0 1118 | 0 1119 | 0 1120 | 0 1121 | 1 1122 | 1 1123 | 0 1124 | 1 1125 | 1 1126 | 1 1127 | 0 1128 | 0 1129 | 0 1130 | 0 1131 | 0 1132 | 0 1133 | 0 1134 | 0 1135 | 1 1136 | 0 1137 | 1 1138 | 1 1139 | 1 1140 | 1 1141 | 0 1142 | 0 1143 | 0 1144 | 0 1145 | 0 1146 | 0 1147 | 1 1148 | 0 1149 | 1 1150 | 0 1151 | 0 1152 | 1 1153 | 0 1154 | 0 1155 | 1 1156 | 1 1157 | 1 1158 | 0 1159 | 0 1160 | 1 1161 | 0 1162 | 1 1163 | 0 1164 | 1 1165 | 1 1166 | 0 1167 | 0 1168 | 1 1169 | 1 1170 | 1 1171 | 0 1172 | 0 1173 | 1 1174 | 0 1175 | 1 1176 | 1 1177 | 1 1178 | 0 1179 | 1 1180 | 1 1181 | 0 1182 | 0 1183 | 1 1184 | 0 1185 | 0 1186 | 1 1187 | 0 1188 | 1 1189 | 0 1190 | 1 1191 | 0 1192 | 1 1193 | 1 1194 | 0 1195 | 1 1196 | 0 1197 | 0 1198 | 1 1199 | 1 1200 | 0 1201 | 0 1202 | 0 1203 | 1 1204 | 0 1205 | 0 1206 | 1 1207 | 1 1208 | 1 1209 | 0 1210 | 1 1211 | 0 1212 | 1 1213 | 1 1214 | 1 1215 | 0 1216 | 0 1217 | 0 1218 | 0 1219 | 0 1220 | 1 1221 | 1 1222 | 0 1223 | 1 1224 | 0 1225 | 0 1226 | 1 1227 | 0 1228 | 1 1229 | 0 1230 | 0 1231 | 1 1232 | 0 1233 | 0 1234 | 0 1235 | 0 1236 | 0 1237 | 1 1238 | 0 1239 | 0 1240 | 0 1241 | 1 1242 | 1 1243 | 0 1244 | 0 1245 | 1 1246 | 0 1247 | 1 1248 | 1 1249 | 1 1250 | 0 1251 | 0 1252 | 0 1253 | 0 1254 | 0 1255 | 0 1256 | 1 1257 | 0 1258 | 0 1259 | 0 1260 | 1 1261 | 0 1262 | 0 1263 | 1 1264 | 0 1265 | 1 1266 | 0 1267 | 1 1268 | 0 1269 | 0 1270 | 0 1271 | 0 1272 | 1 1273 | 1 1274 | 0 1275 | 1 1276 | 1 1277 | 0 1278 | 1 1279 | 1 1280 | 1 1281 | 1 1282 | 0 1283 | 0 1284 | 1 1285 | 0 1286 | 1 1287 | 1 1288 | 0 1289 | 0 1290 | 1 1291 | 0 1292 | 0 1293 | 0 1294 | 1 1295 | 0 1296 | 0 1297 | 0 1298 | 0 1299 | 0 1300 | 0 1301 | 0 1302 | 0 1303 | 1 1304 | 1 1305 | 1 1306 | 1 1307 | 1 1308 | 0 1309 | 1 1310 | 0 1311 | 1 1312 | 1 1313 | 1 1314 | 0 1315 | 1 1316 | 1 1317 | 0 1318 | 1 1319 | 0 1320 | 1 1321 | 0 1322 | 1 1323 | 1 1324 | 0 1325 | 0 1326 | 0 1327 | 0 1328 | 0 1329 | 1 1330 | 1 1331 | 0 1332 | 1 1333 | 1 1334 | 1 1335 | 0 1336 | 0 1337 | 0 1338 | 1 1339 | 0 1340 | 1 1341 | 1 1342 | 0 1343 | 1 1344 | 0 1345 | 0 1346 | 1 1347 | 0 1348 | 0 1349 | 0 1350 | 0 1351 | 0 1352 | 1 1353 | 0 1354 | 0 1355 | 0 1356 | 0 1357 | 0 1358 | 0 1359 | 0 1360 | 0 1361 | 1 1362 | 1 1363 | 0 1364 | 0 1365 | 0 1366 | 0 1367 | 0 1368 | 1 1369 | 0 1370 | 1 1371 | 0 1372 | 0 1373 | 0 1374 | 1 1375 | 1 1376 | 0 1377 | 0 1378 | 1 1379 | 0 1380 | 1 1381 | 0 1382 | 1 1383 | 1 1384 | 1 1385 | 1 1386 | 0 1387 | 0 1388 | 0 1389 | 0 1390 | 1 1391 | 1 1392 | 1 1393 | 1 1394 | 0 1395 | 1 1396 | 1 1397 | 1 1398 | 1 1399 | 0 1400 | 0 1401 | 1 1402 | 0 1403 | 1 1404 | 1 1405 | 1 1406 | 0 1407 | 0 1408 | 0 1409 | 1 1410 | 1 1411 | 0 1412 | 1 1413 | 1 1414 | 1 1415 | 0 1416 | 1 1417 | 0 1418 | 1 1419 | 1 1420 | 0 1421 | 1 1422 | 1 1423 | 0 1424 | 1 1425 | 1 1426 | 1 1427 | 0 1428 | 0 1429 | 0 1430 | 1 1431 | 0 1432 | 0 1433 
| 0 1434 | 1 1435 | 0 1436 | 1 1437 | 0 1438 | 0 1439 | 1 1440 | 0 1441 | 0 1442 | 0 1443 | 0 1444 | 0 1445 | 1 1446 | 0 1447 | 1 1448 | 1 1449 | 0 1450 | 1 1451 | 0 1452 | 0 1453 | 0 1454 | 0 1455 | 0 1456 | 1 1457 | 0 1458 | 1 1459 | 0 1460 | 1 1461 | 1 1462 | 0 1463 | 1 1464 | 0 1465 | 1 1466 | 0 1467 | 0 1468 | 1 1469 | 0 1470 | 0 1471 | 0 1472 | 1 1473 | 1 1474 | 1 1475 | 1 1476 | 0 1477 | 1 1478 | 0 1479 | 0 1480 | 1 1481 | 0 1482 | 0 1483 | 1 1484 | 0 1485 | 1 1486 | 0 1487 | 0 1488 | 0 1489 | 1 1490 | 1 1491 | 1 1492 | 0 1493 | 1 1494 | 1 1495 | 0 1496 | 0 1497 | 0 1498 | 1 1499 | 0 1500 | 1 1501 | 1 1502 | 0 1503 | 1 1504 | 0 1505 | 0 1506 | 1 1507 | 0 1508 | 0 1509 | 1 1510 | 1 1511 | 1 1512 | 0 1513 | 0 1514 | 1 1515 | 0 1516 | 1 1517 | 0 1518 | 1 1519 | 0 1520 | 1 1521 | 0 1522 | 0 1523 | 0 1524 | 0 1525 | 1 1526 | 1 1527 | 0 1528 | 1 1529 | 1 1530 | 0 1531 | 0 1532 | 0 1533 | 0 1534 | 1 1535 | 1 1536 | 0 1537 | 1 1538 | 0 1539 | 0 1540 | 0 1541 | 0 1542 | 1 1543 | 0 1544 | 0 1545 | 0 1546 | 1 1547 | 0 1548 | 0 1549 | 1 1550 | 0 1551 | 0 1552 | 1 1553 | 0 1554 | 1 1555 | 0 1556 | 0 1557 | 1 1558 | 1 1559 | 1 1560 | 0 1561 | 0 1562 | 0 1563 | 1 1564 | 0 1565 | 0 1566 | 1 1567 | 1 1568 | 0 1569 | 0 1570 | 1 1571 | 1 1572 | 0 1573 | 1 1574 | 0 1575 | 1 1576 | 1 1577 | 1 1578 | 0 1579 | 1 1580 | 0 1581 | 1 1582 | 0 1583 | 1 1584 | 1 1585 | 1 1586 | 0 1587 | 0 1588 | 1 1589 | 1 1590 | 1 1591 | 0 1592 | 0 1593 | 0 1594 | 0 1595 | 1 1596 | 0 1597 | 0 1598 | 0 1599 | 1 1600 | 0 1601 | -------------------------------------------------------------------------------- /data/mr/MR.task.train.sentences: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.train.sentences -------------------------------------------------------------------------------- /data/mr/MR.task.unlabel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.unlabel -------------------------------------------------------------------------------- /evaluate_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf8 3 | 4 | import argparse 5 | import sys 6 | import time 7 | 8 | import torch 9 | 10 | sys.path.append('../') 11 | import vectorize 12 | import yutils 13 | import preprocessing 14 | 15 | 16 | def classify_batch(args, model, targets, targets_seqlen, targets_mask, tweets, tweets_seqlen, tweets_mask): 17 | """ 18 | Predict a single batch 19 | return predictions & max_att_weight 20 | """ 21 | assert len(tweets) == len(targets) 22 | 23 | model.eval() 24 | ''' Prepare data and prediction''' 25 | batch_size = len(targets) 26 | from main_batch import var_batch 27 | targets_, targets_seqlen_, targets_mask_, tweets_, tweets_seqlen_, tweets_mask_ = \ 28 | var_batch(args, batch_size, 29 | targets, targets_seqlen, targets_mask, 30 | tweets, tweets_seqlen, tweets_mask) 31 | 32 | probs, _ = model((tweets_, tweets_seqlen_, tweets_mask_), 33 | (targets_, targets_seqlen_, targets_mask_)) 34 | 35 | pred_weight, pred = torch.max(probs, dim=1) 36 | 37 | if args.cuda: 38 | pred = pred.view(-1).cpu().data.numpy() 39 | pred_weights = pred_weight.view(-1).cpu().data.numpy() 40 | else: 41 | pred = pred.view(-1).data.numpy() 42 | pred_weights = pred_weight.view(-1).data.numpy() 43 | 44 | return pred, 
pred_weights 45 | 46 | 47 | def evaluate(args, model, word2idx, seged_tweets, seged_targets): 48 | """ 49 | Input: 50 | 1. list of segmented tweets 51 | 2. list of segmented targets 52 | Output: 53 | 1. list of stance labels for tweets towards targets 54 | 55 | Procedure: 56 | - vectorize the segmented words (with the Word2Vec function in utils, **or directly with the word2idx+embeddings from the JSON file) 57 | - compute the seq_len and mask_matrix of each segmented sentence 58 | - wrap all parameters in Variable, depending on whether the GPU is used 59 | - compute and return the classification results 60 | 61 | :param seged_tweets: 62 | :param seged_targets: 63 | :param word2idx: 64 | :param args: 65 | :param model: 66 | :return: 67 | """ 68 | 69 | ''' sentences to lists_of_word_index ''' 70 | tic = time.time() 71 | tweets = vectorize.sentence_to_index(word2idx, seged_tweets) 72 | targets = vectorize.sentence_to_index(word2idx, seged_targets) 73 | ''' seq_lens and mask_matrix for each sentence ''' 74 | tweets, tweets_seqlen = yutils.get_padding(tweets, max_len=args.ans_max_len) 75 | tweets_mask = yutils.get_mask_matrix(tweets_seqlen, max_len=args.ans_max_len) 76 | targets, targets_seqlen = yutils.get_padding(targets, max_len=args.ask_max_len) 77 | targets_mask = yutils.get_mask_matrix(targets_seqlen, max_len=args.ask_max_len) 78 | assert len(tweets) == len(targets) 79 | # print tweets[0], tweets_seqlen[0], tweets_mask[0] 80 | 81 | print "--------------------" 82 | ''' Variable all parameters ''' 83 | ''' 1. decide batch_size, batch_num ''' 84 | total = len(tweets) 85 | bs = 1000 # batch_size 86 | bn = int(total / bs) # batch_num 87 | left = total - bs * bn 88 | 89 | ''' 2. classify each batch and combine the predictions, a for loop ''' 90 | pred = [] 91 | pred_weights = [] 92 | # batch_size, batch_num 93 | for b in range(bn): 94 | pred_batch, pred_weight_batch = classify_batch(args, model, 95 | targets[b * bs:(b + 1) * bs], 96 | targets_seqlen[b * bs:(b + 1) * bs], 97 | targets_mask[b * bs:(b + 1) * bs], 98 | tweets[b * bs:(b + 1) * bs], 99 | tweets_seqlen[b * bs:(b + 1) * bs], 100 | tweets_mask[b * bs:(b + 1) * bs]) 101 | pred.extend(pred_batch) 102 | pred_weights.extend(pred_weight_batch) 103 | if left > 0: 104 | pred_batch, pred_weight_batch = classify_batch(args, model, 105 | targets[bs * bn:], 106 | targets_seqlen[bs * bn:], 107 | targets_mask[bs * bn:], 108 | tweets[bs * bn:], 109 | tweets_seqlen[bs * bn:], 110 | tweets_mask[bs * bn:]) 111 | pred.extend(pred_batch) 112 | pred_weights.extend(pred_weight_batch) 113 | tit = time.time() - tic 114 | print " Predicting {:d} examples using {:5.4f} seconds".format(total, tit) 115 | 116 | ''' Adjust weights here if needed ''' 117 | # utils.write_list2file(pred_weights, "../data/evaluate/out_predictions_weights.txt") 118 | 119 | return pred, pred_weights 120 | 121 | 122 | def example_main(args, model, word2idx): 123 | print "Begin to classify tweet|target pairs " 124 | """ Load and segment raw tweets|targets files """ 125 | tweets = yutils.read_file2list(args.input + "/processed/seged/a_test_tweets.txt") 126 | targets = yutils.read_file2list(args.input + "/processed/seged/a_test_targets.txt") 127 | seged_tweets = yutils.seg_sentence(tweets, choice="list", place="hpc") # may use lexicon here 128 | seged_targets = yutils.seg_sentence(targets, choice="list", place="hpc") 129 | predictions, pred_weights = evaluate(args, model, word2idx, seged_tweets, seged_targets) 130 | 131 | # for calculating the 10k results 132 | yutils.write_list2file(predictions, "out_predictions.txt") 133 | yutils.write_list2file(pred_weights, "out_predictions_weights.txt") 134 | 135 | preprocessing.write_stance_txt(args.input +
"SemEval2016-Task6-subtaskA-testdata.txt", 136 | "out_predictions.txt", 137 | "z_result/SemEval2016-Task6-subtaskA-testdata-pred.txt") 138 | 139 | 140 | def example_single(args, model, word2idx): 141 | """ Load and segment pair in the command line """ 142 | while True: 143 | target = raw_input("问题: ") 144 | tweet = raw_input("回答: ") 145 | targets = [str(target)] 146 | tweets = [str(tweet)] 147 | seged_tweets = yutils.seg_sentence(tweets, choice="list", place="hpc") # may use lexicon here 148 | seged_targets = yutils.seg_sentence(targets, choice="list", place="hpc") 149 | predictions = evaluate(args, model, word2idx, seged_tweets, seged_targets) 150 | print "预测结果: ", predictions 151 | 152 | 153 | def savez_model(model, model_name="np_AoABatchWinGRU_100_6025_batch8.npz"): 154 | state = model.state_dict() 155 | # output.bias [-0.09973772 0.09077224 0.00347146] 156 | print type(state), len(state), dir(state) 157 | print state.items()[-1], type(state.items()[-1]) 158 | print state.items()[-1][0], state.items()[-1][1].cpu().numpy() 159 | import numpy as np 160 | new_state = dict() 161 | for item in state.items(): 162 | new_state[item[0]] = item[1].cpu().numpy() 163 | np.savez(model_name, **new_state) 164 | state = np.load("aoa.npz") 165 | print state.files 166 | 167 | 168 | if __name__ == "__main__": 169 | parser = argparse.ArgumentParser(description="PyTorch AoA for Sogou Project") 170 | 171 | ''' load data and save model''' 172 | parser.add_argument("--input", type=str, default="../data/semeval2016/", 173 | help="location of dataset") 174 | parser.add_argument("--word2idx", type=str, default="../data/semeval2016/task_a/word2idx_glove.pkl", 175 | help="location of word2idx dictionary") 176 | parser.add_argument("--save", type=str, default="../saved_model/AoABatch/", 177 | help="path to save the model") 178 | parser.add_argument("--target", type=str, default="", 179 | help="which target to classify") 180 | 181 | parser.add_argument("--seed", type=int, default=123456, 182 | help="random seed for reproduction") 183 | parser.add_argument("--cuda", action="store_true", 184 | help="use CUDA") 185 | 186 | ''' test purpose''' 187 | parser.add_argument("--ans_max_len", type=int, default=25, 188 | help="max time step of tweet sequence") 189 | parser.add_argument("--ask_max_len", type=int, default=6, 190 | help="max time step of target sequence") 191 | 192 | example_args = parser.parse_args() 193 | 194 | ''' Load Segmentor ''' 195 | example_segmentor = yutils.load_segmentor(place="hpc") 196 | 197 | ''' Load model ''' 198 | with open(example_args.save + "/model.pt") as f: 199 | if example_args.cuda: 200 | example_model = torch.load(f) 201 | else: 202 | example_model = torch.load(f, map_location=lambda storage, loc: storage) 203 | example_model.cpu() 204 | example_model.eval() 205 | ''' Load word2idx only once ''' 206 | example_word2idx = yutils.pickle2dict(example_args.word2idx) 207 | 208 | example_main(example_args, example_model, example_word2idx) 209 | # example_single(example_args, example_model, example_word2idx) 210 | 211 | ''' TO numpy npz ''' 212 | # savez_model(example_model) 213 | 214 | # while True: 215 | # ''' 216 | # 1. segment sentences 217 | # 2. vectorize sentence, do padding and masks 218 | # 3. 
classify the pairs and get predictions 219 | # ''' 220 | # list_of_qa_pairs = [] 221 | # updated_qa_pairs = stance_classifier(example_segmentor, example_model, list_of_qa_pairs) 222 | # 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /main_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf8 3 | 4 | import argparse 5 | import os 6 | import time 7 | from progress.bar import Bar 8 | import yutils 9 | 10 | import numpy 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | 15 | from torch.autograd import Variable 16 | 17 | 18 | from nnet.blstm import BLSTM 19 | from nnet.lstm import LSTM 20 | from nnet.cnn import CNN  # note: nnet/cnn.py is not included in this repository; remove this import and the "CNN" model entry below if the file is missing 21 | 22 | torch.manual_seed(123456) 23 | 24 | 25 | def test_prf(pred, labels): 26 | """ 27 | log and return prf scores 28 | :return: 29 | """ 30 | total = len(labels) 31 | pred_right = [0, 0] 32 | pred_all = [0, 0] 33 | gold = [0, 0] 34 | for i in range(total): 35 | pred_all[pred[i]] += 1 36 | if pred[i] == labels[i]: 37 | pred_right[pred[i]] += 1 38 | gold[labels[i]] += 1 39 | 40 | print " Prediction:", pred_all, " Right:", pred_right, " Gold:", gold 41 | ''' -- for all labels -- ''' 42 | print " ****** Neg|Pos ******" 43 | accuracy = 1.0 * sum(pred_right) / total 44 | p, r, f1 = yutils.cal_prf(pred_all, pred_right, gold, formation=False) 45 | _, _, macro_f1 = yutils.cal_prf(pred_all, pred_right, gold, 46 | formation=False, 47 | metric_type="macro") 48 | print " Accuracy on test is %d/%d = %f" % (sum(pred_right), total, accuracy) 49 | print " Precision: %s\n Recall : %s\n F1 score : %s\n Macro F1 score on test (Neg|Pos) is %f" \ 50 | % (p, r, f1, macro_f1) 51 | 52 | return accuracy 53 | 54 | 55 | def test(model, dataset, args, data_part="test"): 56 | """ 57 | Evaluate the model on one part of the dataset and report accuracy 58 | :param model: 59 | :param args: 60 | :param dataset: 61 | :param data_part: 62 | :return: 63 | """ 64 | 65 | tvt_set = dataset[data_part] 66 | tvt_set = yutils.YDataset(tvt_set["xIndexes"], 67 | tvt_set["yLabels"], 68 | to_pad=True, max_len=args.sen_max_len) 69 | 70 | test_set = tvt_set 71 | sentences, sentences_seqlen, sentences_mask, labels = test_set.next_batch(len(test_set)) 72 | 73 | assert len(test_set) == len(sentences) == len(labels) 74 | 75 | tic = time.time() 76 | 77 | model.eval() 78 | ''' Prepare data and prediction''' 79 | batch_size = len(sentences) 80 | sentences_, sentences_seqlen_, sentences_mask_ = \ 81 | var_batch(args, batch_size, sentences, sentences_seqlen, sentences_mask) 82 | 83 | probs = model(sentences_, sentences_seqlen_, sentences_mask_) 84 | 85 | _, pred = torch.max(probs, dim=1) 86 | 87 | if args.cuda: 88 | pred = pred.view(-1).cpu().data.numpy() 89 | else: 90 | pred = pred.view(-1).data.numpy() 91 | 92 | tit = time.time() - tic 93 | print " Predicting {:d} examples using {:5.4f} seconds".format(len(test_set), tit) 94 | 95 | labels = numpy.asarray(labels) 96 | ''' log and return prf scores ''' 97 | accuracy = test_prf(pred, labels) 98 | 99 | return accuracy 100 | 101 | 102 | def var_batch(args, batch_size, sentences, sentences_seqlen, sentences_mask): 103 | """ 104 | Transform the input batch to PyTorch variables 105 | :return: 106 | """ 107 | # dtype = torch.from_numpy(sentences, dtype=torch.cuda.LongTensor) 108 | sentences_ = Variable(torch.LongTensor(sentences).view(batch_size, args.sen_max_len)) 109 | sentences_seqlen_ = Variable(torch.LongTensor(sentences_seqlen).view(batch_size, 1)) 110 |
sentences_mask_ = Variable(torch.LongTensor(sentences_mask).view(batch_size, args.sen_max_len)) 111 | 112 | if args.cuda: 113 | sentences_ = sentences_.cuda() 114 | sentences_seqlen_ = sentences_seqlen_.cuda() 115 | sentences_mask_ = sentences_mask_.cuda() 116 | 117 | return sentences_, sentences_seqlen_, sentences_mask_ 118 | 119 | 120 | def train(model, training_data, args, optimizer, criterion): 121 | model.train() 122 | 123 | batch_size = args.batch_size 124 | 125 | sentences, sentences_seqlen, sentences_mask, labels = training_data 126 | 127 | # print batch_size, len(sentences), len(labels) 128 | 129 | assert batch_size == len(sentences) == len(labels) 130 | 131 | ''' Prepare data and prediction''' 132 | sentences_, sentences_seqlen_, sentences_mask_ = \ 133 | var_batch(args, batch_size, sentences, sentences_seqlen, sentences_mask) 134 | labels_ = Variable(torch.LongTensor(labels)) 135 | if args.cuda: 136 | labels_ = labels_.cuda() 137 | 138 | assert len(sentences) == len(labels) 139 | 140 | model.zero_grad() 141 | probs = model(sentences_, sentences_seqlen_, sentences_mask_) 142 | loss = criterion(probs.view(len(labels_), -1), labels_) 143 | 144 | loss.backward() 145 | optimizer.step() 146 | 147 | 148 | def main(args): 149 | # define location to save the model 150 | if args.save == "__": 151 | # LSTM_100_40_8 152 | args.save = "saved_model/%s_%d_%d_%d" % \ 153 | (args.model, args.nhid, args.sen_max_len, args.batch_size) 154 | 155 | in_dir = "data/mr/" 156 | dataset = yutils.pickle2dict(in_dir + "features_glove.pkl") 157 | 158 | if args.is_test: 159 | with open(args.save + "/model.pt") as f: 160 | model = torch.load(f) 161 | test(model, dataset, args) 162 | 163 | else: 164 | ''' make sure the folder to save models exist ''' 165 | if not os.path.exists(args.save): 166 | os.mkdir(args.save) 167 | 168 | embeddings = yutils.pickle2dict(in_dir + "embeddings_glove.pkl") 169 | dataset["embeddings"] = embeddings 170 | emb_np = numpy.asarray(embeddings, dtype=numpy.float32) # from_numpy 171 | emb = torch.from_numpy(emb_np) 172 | 173 | models = {"LSTM": LSTM, "BLSTM": BLSTM, "CNN": CNN}  # "CNN" is usable only if nnet/cnn.py is provided 174 | model = models[args.model](embeddings=emb, 175 | input_dim=args.embsize, 176 | hidden_dim=args.nhid, 177 | num_layers=args.nlayers, 178 | output_dim=2, 179 | max_len=args.sen_max_len, 180 | dropout=args.dropout) 181 | 182 | if torch.cuda.is_available(): 183 | if not args.cuda: 184 | print "Warning: You have a CUDA device, so you should probably run with --cuda" 185 | else: 186 | torch.cuda.manual_seed(args.seed) 187 | model.cuda() 188 | 189 | optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=1e-5) 190 | criterion = nn.CrossEntropyLoss()  # note: this expects raw logits, while the models in nnet/ return softmax probabilities 191 | 192 | training_set = dataset["training"] 193 | training_set = yutils.YDataset(training_set["xIndexes"], 194 | training_set["yLabels"], 195 | to_pad=True, 196 | max_len=args.sen_max_len) 197 | 198 | best_acc_test, best_acc_valid = -numpy.inf, -numpy.inf 199 | batches_per_epoch = int(len(training_set)/args.batch_size) 200 | print "--------------\nEpoch 0 begins!"
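# note: the x10 factor below stretches training to ten times the nominal --epochs budget; the best model on validation is snapshotted after each epoch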
201 | max_train_steps = int(args.epochs * batches_per_epoch * 10) 202 | bar = Bar(" Processing", max=max_train_steps) 203 | tic = time.time() 204 | print "-----------------------------", max_train_steps, len(training_set), args.batch_size 205 | 206 | for step in xrange(max_train_steps): 207 | bar.next() 208 | training_batch = training_set.next_batch(args.batch_size) 209 | 210 | train(model, training_batch, args, optimizer, criterion) 211 | 212 | if (step+1) % batches_per_epoch == 0: 213 | print " using %.5f seconds" % (time.time() - tic) 214 | tic = time.time() 215 | ''' Test after each epoch ''' 216 | print "\n Begin to predict the results on Validation" 217 | acc_score = test(model, dataset, args, data_part="validation") 218 | 219 | print " ----Old best acc score on validation is %f" % best_acc_valid 220 | if acc_score > best_acc_valid: 221 | print " ----New acc score on validation is %f" % acc_score 222 | best_acc_valid = acc_score 223 | with open(args.save + "/model.pt", 'wb') as to_save: 224 | torch.save(model, to_save) 225 | 226 | acc_test = test(model, dataset, args) 227 | print " ----Old best acc score on test is %f" % best_acc_test 228 | if acc_test > best_acc_test: 229 | best_acc_test = acc_test 230 | print " ----New acc score on test is %f" % acc_test 231 | 232 | print "--------------\nEpoch %d begins!" % (training_set.epochs_completed + 1) 233 | 234 | # print the final result 235 | with open(args.save + "/model.pt") as f: 236 | model = torch.load(f) 237 | test(model, dataset, args) 238 | bar.finish() 239 | 240 | 241 | if __name__ == "__main__": 242 | parser = argparse.ArgumentParser(description="PyTorch BLSTM/LSTM for sentence classification") 243 | 244 | ''' load data and save model''' 245 | parser.add_argument("--save", type=str, default="__", 246 | help="path to save the model") 247 | 248 | ''' model parameters ''' 249 | parser.add_argument("--model", type=str, default="BLSTM", 250 | help="type of model to use (LSTM, BLSTM, or CNN)") 251 | parser.add_argument("--embsize", type=int, default=100, 252 | help="size of word embeddings") 253 | parser.add_argument("--emb", type=str, default="glove", 254 | help="type of word embeddings") 255 | parser.add_argument("--nhid", type=int, default=50, 256 | help="size of RNN hidden layer") 257 | parser.add_argument("--nlayers", type=int, default=1, 258 | help="number of layers of LSTM") 259 | parser.add_argument("--lr", type=float, default=0.01, 260 | help="learning rate") 261 | parser.add_argument("--epochs", type=int, default=100, 262 | help="number of training epochs") 263 | parser.add_argument("--batch_size", type=int, default=8, 264 | help="batch size") 265 | parser.add_argument("--dropout", type=float, default=0.1, 266 | help="dropout rate") 267 | parser.add_argument("--seed", type=int, default=123456, 268 | help="random seed for reproduction") 269 | parser.add_argument("--cuda", action="store_true", 270 | help="use CUDA") 271 | 272 | parser.add_argument("--sen_max_len", type=int, default=40, 273 | help="max time step of sentence sequence") 274 | ''' test purpose''' 275 | parser.add_argument("--is_test", action="store_true", 276 | help="flag to skip training and only run the test") 277 | 278 | my_args = parser.parse_args() 279 | 280 | torch.manual_seed(my_args.seed) 281 | 282 | main(my_args) 283 | -------------------------------------------------------------------------------- /nnet/.DS_Store: --------------------------------------------------------------------------------
/nnet/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/nnet/.DS_Store
--------------------------------------------------------------------------------
/nnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/nnet/__init__.py
--------------------------------------------------------------------------------
/nnet/blstm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf8
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | 
9 | torch.manual_seed(123456)
10 | 
11 | 
12 | class BLSTM(nn.Module):
13 |     """
14 |     Implementation of BLSTM Concatenation for sentiment classification task
15 |     """
16 | 
17 |     def __init__(self, embeddings, input_dim, hidden_dim, num_layers, output_dim, max_len=40, dropout=0.5):
18 |         super(BLSTM, self).__init__()
19 | 
20 |         self.emb = nn.Embedding(num_embeddings=embeddings.size(0),
21 |                                 embedding_dim=embeddings.size(1),
22 |                                 padding_idx=0)
23 |         self.emb.weight = nn.Parameter(embeddings)
24 | 
25 |         self.input_dim = input_dim
26 |         self.hidden_dim = hidden_dim
27 |         self.output_dim = output_dim
28 | 
29 |         # sentence encoder
30 |         self.sen_len = max_len
31 |         self.sen_rnn = nn.LSTM(input_size=input_dim,
32 |                                hidden_size=hidden_dim,
33 |                                num_layers=num_layers,
34 |                                dropout=dropout,
35 |                                batch_first=True,
36 |                                bidirectional=True)
37 | 
38 |         self.output = nn.Linear(2 * self.hidden_dim, output_dim)
39 | 
40 |     def bi_fetch(self, rnn_outs, seq_lengths, batch_size, max_len):
41 |         rnn_outs = rnn_outs.view(batch_size, max_len, 2, -1)  # split forward/backward directions
42 | 
43 |         # (batch_size, max_len, 1, hid); the hard-coded .cuda() calls below assume a GPU run
44 |         fw_out = torch.index_select(rnn_outs, 2, Variable(torch.LongTensor([0])).cuda())
45 |         fw_out = fw_out.view(batch_size * max_len, -1)
46 |         bw_out = torch.index_select(rnn_outs, 2, Variable(torch.LongTensor([1])).cuda())
47 |         bw_out = bw_out.view(batch_size * max_len, -1)
48 | 
49 |         batch_range = Variable(torch.LongTensor(range(batch_size))).cuda() * max_len
50 |         batch_zeros = Variable(torch.zeros(batch_size).long()).cuda()
51 | 
52 |         fw_index = batch_range + seq_lengths.view(batch_size) - 1  # last valid step of each sequence
53 |         fw_out = torch.index_select(fw_out, 0, fw_index)  # (batch_size, hid)
54 | 
55 |         bw_index = batch_range + batch_zeros  # first step of each sequence
56 |         bw_out = torch.index_select(bw_out, 0, bw_index)
57 | 
58 |         outs = torch.cat([fw_out, bw_out], dim=1)
59 |         return outs
60 | 
61 |     def forward(self, sen_batch, sen_lengths, sen_mask_matrix):
62 |         """
63 | 
64 |         :param sen_batch: (batch, sen_length), tensor for sentence sequence
65 |         :param sen_lengths: (batch,), valid lengths before padding
66 |         :param sen_mask_matrix: (batch, sen_length), 1 for tokens, 0 for padding (unused here)
67 |         :return: (batch, output_dim), softmax class probabilities
68 |         """
69 | 
70 |         ''' Embedding Layer | Padding | Sequence_length 40 '''
71 |         sen_batch = self.emb(sen_batch)
72 | 
73 |         batch_size = len(sen_batch)
74 | 
75 |         ''' Bi-LSTM Computation '''
76 |         sen_outs, _ = self.sen_rnn(sen_batch.view(batch_size, -1, self.input_dim))
77 |         sen_rnn = sen_outs.contiguous().view(batch_size, -1, 2 * self.hidden_dim)  # (batch, sen_len, 2*hid)
78 | 
79 |         ''' Fetch the hidden state at the last valid time step of the forward pass
80 |         and the first-step state of the backward pass '''
81 |         sentence_batch = self.bi_fetch(sen_rnn, sen_lengths, batch_size, self.sen_len)  # (batch_size, 2*hid)
82 | 
83 |         representation = sentence_batch
84 |         out = self.output(representation)
85 |         out_prob = F.softmax(out.view(batch_size, -1))  # note: returns probabilities, not logits
86 | 
87 |         return out_prob
88 | 
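To make the index arithmetic in bi_fetch concrete, here is a small standalone sketch (hypothetical toy sizes, current PyTorch API, CPU only). After splitting the BLSTM output into its two directions and flattening to (batch * max_len, hid), the forward representation of sentence i lives at row i * max_len + length_i - 1, and the backward representation at row i * max_len:

import torch

batch_size, max_len, hid = 2, 4, 3
seq_lengths = torch.tensor([4, 2])                    # valid length of each sentence

rnn_outs = torch.randn(batch_size, max_len, 2 * hid)  # what the BLSTM returns
outs = rnn_outs.view(batch_size, max_len, 2, -1)      # split fw/bw directions

fw = outs[:, :, 0, :].contiguous().view(batch_size * max_len, -1)
bw = outs[:, :, 1, :].contiguous().view(batch_size * max_len, -1)

batch_range = torch.arange(batch_size) * max_len
fw_index = batch_range + seq_lengths - 1              # last valid step per sentence
bw_index = batch_range                                # step 0 per sentence

sent_repr = torch.cat([fw[fw_index], bw[bw_index]], dim=1)
print(sent_repr.shape)                                # torch.Size([2, 6]) == (batch, 2 * hid)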
--------------------------------------------------------------------------------
/nnet/lstm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf8
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | 
9 | torch.manual_seed(123456)
10 | 
11 | 
12 | class LSTM(nn.Module):
13 |     """
14 |     Implementation of a unidirectional LSTM for sentence classification
15 |     Final representation is the hidden state at the last valid time step
16 |     """
17 | 
18 |     def __init__(self, embeddings, input_dim, hidden_dim, num_layers, output_dim, max_len=40, dropout=0.5):
19 |         super(LSTM, self).__init__()
20 | 
21 |         self.emb = nn.Embedding(num_embeddings=embeddings.size(0),
22 |                                 embedding_dim=embeddings.size(1),
23 |                                 padding_idx=0)
24 |         self.emb.weight = nn.Parameter(embeddings)
25 | 
26 |         self.input_dim = input_dim
27 |         self.hidden_dim = hidden_dim
28 |         self.output_dim = output_dim
29 | 
30 |         # sentence encoder
31 |         self.sen_len = max_len
32 |         self.sen_rnn = nn.LSTM(input_size=input_dim,
33 |                                hidden_size=hidden_dim,
34 |                                num_layers=num_layers,
35 |                                dropout=dropout,
36 |                                batch_first=True,
37 |                                bidirectional=False)
38 | 
39 |         self.output = nn.Linear(self.hidden_dim, output_dim)
40 | 
41 |     def _fetch(self, rnn_outs, seq_lengths, batch_size, max_len):
42 |         rnn_outs = rnn_outs.view(batch_size, max_len, 1, -1)
43 | 
44 |         # (batch_size, max_len, 1, hid); the hard-coded .cuda() calls below assume a GPU run
45 |         fw_out = torch.index_select(rnn_outs, 2, Variable(torch.LongTensor([0])).cuda())
46 |         fw_out = fw_out.view(batch_size * max_len, -1)
47 | 
48 |         batch_range = Variable(torch.LongTensor(range(batch_size))).cuda() * max_len
49 | 
50 |         fw_index = batch_range + seq_lengths.view(batch_size) - 1  # last valid step per sequence
51 |         fw_out = torch.index_select(fw_out, 0, fw_index)  # (batch_size, hid)
52 | 
53 |         return fw_out
54 | 
55 |     def forward(self, sen_batch, sen_lengths, sen_mask_matrix):
56 |         """
57 | 
58 |         :param sen_batch: (batch, sen_length), tensor for sentence sequence
59 |         :param sen_lengths: (batch,), valid lengths before padding
60 |         :param sen_mask_matrix: (batch, sen_length), 1 for tokens, 0 for padding (unused here)
61 |         :return: (batch, output_dim), softmax class probabilities
62 |         """
63 | 
64 |         ''' Embedding Layer | Padding | Sequence_length 40 '''
65 |         sen_batch = self.emb(sen_batch)
66 | 
67 |         batch_size = len(sen_batch)
68 | 
69 |         ''' LSTM Computation '''
70 |         sen_outs, _ = self.sen_rnn(sen_batch.view(batch_size, -1, self.input_dim))
71 | 
72 |         # batch_first only changes the view; the output may not be contiguous
73 |         sen_rnn = sen_outs.contiguous().view(batch_size, -1, self.hidden_dim)  # (batch, sen_len, hid)
74 | 
75 |         ''' Fetch the hidden state at the last valid time step
76 |         '''
77 |         sentence_batch = self._fetch(sen_rnn, sen_lengths, batch_size, self.sen_len)  # (batch_size, hid)
78 | 
79 |         representation = sentence_batch
80 |         out = self.output(representation)
81 |         out_prob = F.softmax(out.view(batch_size, -1))  # note: returns probabilities, not logits
82 | 
83 |         return out_prob
84 | 
--------------------------------------------------------------------------------
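Both _fetch above and bi_fetch in blstm.py recover the last valid hidden state by hand. For comparison, newer PyTorch can achieve the same effect with pack_padded_sequence, which makes the LSTM skip padding and return the last valid state per direction in h_n directly. A minimal sketch (hypothetical sizes; not part of this repository):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

batch, max_len, emb_dim, hid = 2, 5, 10, 7
rnn = nn.LSTM(emb_dim, hid, batch_first=True, bidirectional=True)

x = torch.randn(batch, max_len, emb_dim)   # padded inputs
lengths = torch.tensor([5, 3])             # valid lengths, sorted descending

packed = pack_padded_sequence(x, lengths, batch_first=True)
_, (h_n, _) = rnn(packed)                  # h_n: (num_directions, batch, hid)

# last valid forward state and the step-0 backward state, concatenated:
sent_repr = torch.cat([h_n[0], h_n[1]], dim=1)   # (batch, 2 * hid)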
/preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf-8
3 | """
4 | This is the older vectorization pipeline, intended for research experiments rather than production use.
5 | Tips:
6 |     - Embeddings are extracted to a numpy matrix
7 |     - Use pickle instead of JSON to avoid string-encoding inconsistencies
8 |     - Vectorization and padding can be done together
9 | """
10 | import sys
11 | import numpy as np
12 | 
13 | import yutils
14 | 
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 | np.random.seed(1234567)
18 | 
19 | #################
20 | # read text files
21 | #################
22 | 
23 | 
24 | def read_mr_txt(filename="data/mr/"):
25 |     """
26 |     Read labeled data: one "label \t sentence" pair per line
27 | 
28 |     :param filename: path to the labeled data file
29 |     :return: tokenized sentences and their labels
30 |     """
31 |     raw_data = yutils.read_file2list(filename)
32 | 
33 |     sentences = []
34 |     labels = []  # 0 / 1
35 | 
36 |     for line in raw_data:
37 |         label, sentence = line.split("\t")
38 | 
39 |         sentences.append(sentence)
40 |         labels.append(label)
41 | 
42 |     assert len(sentences) == len(labels)
43 |     sentences = yutils.tokenize_sentence(sentences, choice="list")
44 |     sentences = [yutils.list2string(sentence) for sentence in sentences]
45 |     return sentences, labels
46 | 
47 | #################
48 | # read embeddings
49 | #################
50 | 
51 | 
52 | def read_emb_idx(filename):
53 |     """
54 |     1. Read the embeddings file into:
55 |        "embeddings": numpy matrix, each row a vector at its corresponding index
56 |        "word2idx": word2idx[word] = idx, the row index in the "embeddings" matrix
57 |        "idx2word": the reverse dict of "word2idx"
58 |     2. Add padding and unk entries to all three structures
59 |     :param filename:
60 |         file format per line: a word followed by its embedding values, separated by blanks
61 |     :return:
62 |         vocab = {"embeddings": embeddings, "word2idx": word2idx, "idx2word": idx2word}
63 |     """
64 |     with open(filename, 'r') as f:
65 |         embeddings = []
66 |         word2idx = dict()
67 | 
68 |         word2idx["_padding"] = 0  # PyTorch Embedding lookup needs the padding index to be zero
69 |         word2idx["_unk"] = 1
70 | 
71 |         for line in f:
72 |             line = line.strip()
73 |             one = line.split(' ')
74 |             word = one[0]
75 |             emb = [float(i) for i in one[1:]]
76 |             embeddings.append(emb)
77 |             word2idx[word] = len(word2idx)
78 | 
79 |         ''' Add padding and unknown word to embeddings and word2idx '''
80 |         emb_dim = len(embeddings[0])
81 |         embeddings.insert(0, np.zeros(emb_dim))  # _padding
82 |         embeddings.insert(1, np.random.random(emb_dim))  # _unk
83 | 
84 |         embeddings = np.asarray(embeddings, dtype=np.float32)
85 |         embeddings = embeddings.reshape(len(embeddings), emb_dim)
86 | 
87 |         idx2word = dict((word2idx[word], word) for word in word2idx)
88 |         vocab = {"embeddings": embeddings, "word2idx": word2idx, "idx2word": idx2word}
89 | 
90 |         print "Finished loading embeddings from %s * * * * * * * * * * * *" % filename
91 |         return vocab
92 | 
93 | 
94 | #############################################################
95 | """ Raw data --> pickle
96 | output file structure looks like this:
97 | {"training": {
98 |         "xIndexes": []
99 |         "yLabels": []
100 |         }
101 |  "validation": ...
102 |  "test": ...
103 |  "word2idx": {"_padding": 0, "_unk": 1, "1st": 2, "hello": 3, ...}
104 |  "embedding": [ [word0], [word1], [word2], ...]
105 | }
106 | """
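# A hypothetical sanity check (illustration only) of the structure documented
# above, once processing() below has produced the pickles; note that the
# features pickle holds only the three splits, while word2idx and embeddings
# are written to separate pickles:
#
#   data = yutils.pickle2dict("data/mr/features_glove.pkl")
#   print data.keys()                       # the three splits: training / validation / test
#   print data["training"]["xIndexes"][0]   # e.g. [2, 57, 3, ...] (word indexes)
#   print data["training"]["yLabels"][0]    # 0 or 1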
107 | #################
108 | # evaluation
109 | #################
110 | 
111 | 
112 | def sentence_to_index(word2idx, sentences):
113 |     """
114 |     Transform sentences into lists of word indexes
115 |     :param word2idx:
116 |         word2idx = {word: idx, ...}
117 |     :param sentences:
118 |         list of sentences, each a list of words
119 |     :return:
120 |     """
121 |     print "-------------begin making sentence xIndexes-------------"
122 |     sentences_indexes = []
123 |     for sentence in sentences:
124 |         s_index = []
125 |         for word in sentence:
126 | 
127 |             if word == "\n":
128 |                 continue
129 |             if word in word2idx:
130 |                 s_index.append(word2idx[word])
131 |             else:
132 |                 s_index.append(word2idx["_unk"])
133 |                 print " --", word, "-- "  # log out-of-vocabulary words
134 | 
135 |         if len(s_index) == 0:
136 |             print len(sentence), "+++++++++++++++++++++++++++++++++ empty sentence"
137 |             s_index.append(word2idx["_unk"])
138 |         sentences_indexes.append(s_index)
139 |     assert len(sentences_indexes) == len(sentences)
140 |     print "-------------finish making sentence xIndexes-------------"
141 |     return sentences_indexes
142 | 
143 | 
144 | def make_datasets(word2idx, raw_data):
145 |     """
146 |     :param word2idx:
147 |         word2idx = {word: idx, ...}
148 |     :param raw_data:
149 |         raw_data = {"training": (inputs, labels),
150 |                     "validation": ...,
151 |                     "test": ...}
152 |     :return:
153 |     """
154 |     datasets = dict()
155 | 
156 |     for i in ["training", "validation", "test"]:
157 |         sentences, labels = raw_data[i]
158 |         xIndexes = sentence_to_index(word2idx, sentences)
159 |         yLabels = [int(label) for label in labels]
160 |         yLabels = np.asarray(yLabels, dtype=np.int64).reshape(len(labels))
161 |         datasets[i] = {"xIndexes": xIndexes,
162 |                        "yLabels": yLabels}
163 | 
164 |     return datasets
165 | 
166 | #############################################################
167 | 
168 | 
169 | def processing(args):
170 |     input_dir = "data/mr/"
171 |     output_dir = input_dir
172 |     # read raw text
173 |     data = []  # [sentences, labels] per split
174 |     fns = ["data/mr/MR.task.train",
175 |            "data/mr/MR.task.test"]
176 |     for fn in fns:
177 |         # sentences, labels
178 |         sentences = yutils.read_file2lol(fn + ".sentences")
179 |         labels = yutils.read_file2list(fn + ".labels")
180 |         data.append([sentences, labels])
181 | 
182 |     assert len(data[0][0]) == len(data[0][1])
183 |     assert len(data[1][0]) == len(data[1][1])
184 | 
185 |     # split the dataset: train, test
186 |     yutils.shuffle(data[0], seed=123456)
187 |     test = data[1]
188 |     if args.has_valid:
189 |         train_num = int(len(data[0][0]) * 0.8)
190 |         train = [d[:train_num] for d in data[0]]
191 |         valid = [d[train_num:] for d in data[0]]
192 |     else:
193 |         train = data[0]
194 |         valid = test
195 | 
196 |     assert len(train[0]) == len(train[1])
197 |     assert len(valid[0]) == len(valid[1])
198 |     assert len(test[0]) == len(test[1])
199 | 
200 |     raw_data = {"training": train,
201 |                 "validation": valid,
202 |                 "test": test}
203 | 
204 |     # read the embedding files
205 |     run_place = {"hpc": "/users2/jhyuan/", "local": "/Users/Isaac/athand/Code/"}
206 |     emb_file = run_place[args.place] + "nlp_res/embeddings/glove/glove.6B.100d.txt"
207 |     vocab = read_emb_idx(emb_file)
208 |     word2idx, embeddings = vocab["word2idx"], vocab["embeddings"]
209 | 
210 |     # transform sentences to word indexes
211 |     datasets = make_datasets(word2idx, raw_data)
212 | 
213 |     # output the transformed files
214 |     yutils.dict2pickle(datasets, output_dir + "/features_glove.pkl")
215 |     yutils.dict2pickle(word2idx, output_dir + "/word2idx_glove.pkl")
216 |     yutils.dict2pickle(embeddings, output_dir + "/embeddings_glove.pkl")
217 | 
218 |     # test correctness
219 |     word2idx = yutils.pickle2dict(output_dir + "/word2idx_glove.pkl")
220 |     print word2idx["_padding"], word2idx["_unk"]
221 | 
222 | 
223 | if __name__ == "__main__":
224 |     import argparse
225 | 
226 |     parser = argparse.ArgumentParser(description="Pre-processing Movie Review Dataset")
227 | 
228 |     parser.add_argument("--place", type=str, default="local",
229 |                         help="decide the location of LTP and data")
230 | 
231 |     parser.add_argument("--has_valid", action="store_true",
232 |                         help="whether to hold out 'real' validation data for tuning the model")
233 | 
234 |     my_args = parser.parse_args()
235 | 
236 |     # for fn in ["data/mr/MR.task.train", "data/mr/MR.task.test"]:
237 |     #     sentences, labels = read_mr_txt(fn)
238 |     #     yutils.write_list2file(sentences, fn + ".sentences")
239 |     #     yutils.write_list2file(labels, fn + ".labels")
240 |     processing(my_args)
241 | 
--------------------------------------------------------------------------------
/yutils.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | Utility functions used by the other modules:
4 | tokenization, file I/O, pickling, PRF metrics, padding and batching.
5 | 
6 | 
7 | """
8 | import sys
9 | import random
10 | 
11 | import numpy as np
12 | 
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 | random.seed(1)
16 | 
17 | 
18 | ###############
19 | # English pre-processing
20 | ###############
21 | def tokenize_sentence(senlist, choice="string"):
22 |     import nltk
23 |     tokenized_sen = []
24 |     if choice == "string":
25 |         for s in senlist:
26 |             s = s.replace(" #SemST", "")  # remove those irrelevant tags
27 |             s = s.lower()
28 |             tokens = nltk.word_tokenize(s)
29 |             # print type(tokens)
30 |             tokens = list2string(tokens)
31 |             tokenized_sen.append(tokens)
32 |     else:
33 |         for s in senlist:
34 |             tokens = nltk.word_tokenize(s)
35 |             tokens = list(tokens)
36 |             tokenized_sen.append(tokens)
37 | 
38 |     return tokenized_sen
39 | 
40 | 
41 | ###############
42 | # String Utilities
43 | ###############
44 | def list2string(list_of_words, has_blank=True):
45 |     """convert a list of segmented words into a single string"""
46 |     l = list_of_words
47 |     s = ""
48 |     if has_blank:
49 |         for i in l:
50 |             if i not in set(["\n", " ", "\n\n"]):
51 |                 s += i + " "
52 |     else:
53 |         for i in l:
54 |             if i != "\n" and i != " " and i != "\n\n":
55 |                 s += i
56 |     return s
57 | 
58 | 
59 | def string2list(sentence_in_string):
60 |     """convert a string with '\n' to a list of words without '\n'"""
61 |     return sentence_in_string.strip().split()  # strip removes the trailing \n
62 | 
63 | 
64 | # contents is a list of strings
65 | def write_list2file(contents, filename):
66 |     s = ''
67 |     for i in contents:
68 |         s += (str(i) + "\n")
69 |     with open(filename, 'w') as f:
70 |         f.write(s)
71 |     print "********** Wrote to file successfully"
72 | 
73 | 
74 | # read raw text into a list (sentences as strings)
75 | def read_file2list(filename):
76 |     contents = []
77 |     with open(filename, 'r') as f:
78 |         contents = [line.split("\n")[0] for line in f]
79 |     print "The file has lines: ", len(contents)
80 |     return contents
81 | 
82 | 
83 | # read a segmented corpus into a list (sentences as lists of words)
84 | def read_file2lol(filename):
85 |     with open(filename, 'r') as f:
86 |         contents = [string2list(line) for line in f]
87 |     print "The file has lines: ", len(contents)
88 |     return contents
89 | 
90 | 
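# Illustration (hypothetical values) of the helpers above: with choice="string",
# tokenize_sentence lowercases, strips the " #SemST" tag, tokenizes with nltk,
# and re-joins the tokens with blanks; string2list simply splits again:
#
#   tokenize_sentence(["Great movie! #SemST"], choice="string")
#   # -> ["great movie ! "]
#   string2list("great movie ! ")
#   # -> ["great", "movie", "!"]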
91 | # read raw text (segmented or tokenized) and report average string length
92 | def avg_str_len(filename):
93 |     contents = read_file2lol(filename)
94 |     num_sentences = len(contents)
95 |     len_list = [len(sen) for sen in contents]
96 |     num_words = sum(len_list)
97 |     words_per_sen = 1.0 * num_words / num_sentences
98 |     print "%d sentences have %d words, avg=%f" % (num_sentences, num_words, words_per_sen)
99 |     print "max length = %d min length = %d" % (max(len_list), min(len_list))
100 |     return words_per_sen
101 | 
102 | 
103 | ###################
104 | # Serialization to pickle
105 | ###################
106 | def dict2pickle(your_dict, out_file):
107 |     try:
108 |         import cPickle as pickle
109 |     except ImportError:
110 |         import pickle
111 |     with open(out_file, 'wb') as f:
112 |         pickle.dump(your_dict, f)
113 | 
114 | 
115 | def pickle2dict(in_file):
116 |     try:
117 |         import cPickle as pickle
118 |     except ImportError:
119 |         import pickle
120 |     with open(in_file, 'rb') as f:  # binary mode to match pickle.dump
121 |         your_dict = pickle.load(f)
122 |     return your_dict
123 | 
124 | 
125 | def cal_word_freq(corpus, input_format="listoflist"):
126 |     """
127 |     :param corpus: list of sentences (each a list of segmented words)
128 | 
129 |     :return: word frequencies of the given corpus, most frequent first
130 |     """
131 |     if input_format != "listoflist":
132 |         corpus = [string2list(i) for i in corpus]
133 |     freq = dict()
134 |     for sentence in corpus:
135 |         for word in sentence:
136 |             if word not in freq:
137 |                 freq[word] = 0  # fixed: starting at 1 would double-count the first occurrence
138 |             freq[word] += 1
139 |     result = [[freq[word], word] for word in freq]
140 |     revert_result = sorted(result, key=lambda d: d[0], reverse=True)
141 |     print "The word freq of given corpus"
142 |     for i in revert_result:
143 |         print i[0], i[1]
144 |     return [str(i[0]) + " " + str(i[1]) + "\n" for i in revert_result]
145 | 
146 | 
147 | def shuffle(lol, seed=1234567890):
148 |     """
149 |     lol :: list of lists as input
150 |     seed :: seed for the shuffling
151 | 
152 |     shuffle each list in place, all in the same order
153 |     """
154 |     for l in lol:
155 |         random.seed(seed)  # re-seed before each list so every list gets the same permutation
156 |         random.shuffle(l)
157 | 
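# Illustration (hypothetical values): because shuffle() re-seeds before
# shuffling each list, parallel lists stay aligned after shuffling:
#
#   sents, labels = ["s0", "s1", "s2"], [0, 1, 2]
#   shuffle([sents, labels], seed=42)
#   # both lists receive the same permutation, e.g.
#   #   sents  -> ["s1", "s2", "s0"]
#   #   labels -> [1, 2, 0]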
158 | 
159 | def cal_prf(pred, right, gold, formation=True, metric_type=""):
160 |     """
161 |     :param pred: number of predictions per class
162 |     :param right: number of correct predictions per class
163 |     :param gold: number of gold instances per class
164 |     :param formation: whether to format the floats to 6 digits (as strings)
165 |     :param metric_type: "", "macro" or "micro"
166 |     :return: precision, recall and F1, per class or aggregated
167 |     """
168 |     ''' e.g. Pred: [0, 2905, 0]  Right: [0, 2083, 0]  Gold: [370, 2083, 452] '''
169 |     num_class = len(pred)
170 |     precision = [0.0] * num_class
171 |     recall = [0.0] * num_class
172 |     f1_score = [0.0] * num_class
173 | 
174 |     for i in xrange(num_class):
175 |         ''' cal precision for each class: right / predicted '''
176 |         precision[i] = 0 if pred[i] == 0 else 1.0 * right[i] / pred[i]
177 | 
178 |         ''' cal recall for each class: right / gold '''
179 |         recall[i] = 0 if gold[i] == 0 else 1.0 * right[i] / gold[i]
180 | 
181 |         ''' cal f1 for each class: 2pr / (p + r) '''
182 |         f1_score[i] = 0 if precision[i] == 0 or recall[i] == 0 \
183 |             else 2.0 * (precision[i] * recall[i]) / (precision[i] + recall[i])
184 | 
185 |         if formation:
186 |             precision[i] = precision[i].__format__(".6f")  # note: converts the values to strings
187 |             recall[i] = recall[i].__format__(".6f")
188 |             f1_score[i] = f1_score[i].__format__(".6f")
189 | 
190 |     ''' PRF for each label or PRF for all labels '''
191 |     if metric_type == "macro":  # note: the aggregated branches require formation=False
192 |         precision = sum(precision) / len(precision)
193 |         recall = sum(recall) / len(recall)
194 |         f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
195 |     elif metric_type == "micro":
196 |         precision = 1.0 * sum(right) / sum(pred) if sum(pred) > 0 else 0
197 |         recall = 1.0 * sum(right) / sum(gold) if sum(gold) > 0 else 0  # fixed: the condition was sum(recall) > 0
198 |         f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
199 | 
200 |     return precision, recall, f1_score
201 | 
202 | 
203 | #################
204 | # Padding, Mask Matrix and NextBatch training
205 | #################
206 | 
207 | 
208 | def get_padding(sentences, max_len):
209 |     """
210 |     :param sentences: raw sentence --> index-padded sentence
211 |                       [2, 3, 4], 5 --> [2, 3, 4, 0, 0]
212 |     :param max_len: number of steps to unroll for an LSTM
213 |     :return: sentences of max_len size with zero paddings, plus the valid lengths
214 |     """
215 |     seq_len = np.zeros((0,))
216 |     padded = np.zeros((0, max_len))
217 |     for sentence in sentences:
218 |         num_words = len(sentence)
219 |         num_pad = max_len - num_words
220 |         ''' special case for answers: max_len 60 = first 45 + last 15 tokens '''
221 |         if max_len == 60 and num_words > 60:
222 |             sentence = sentence[:45] + sentence[num_words-15:]
223 |             sentence = np.asarray(sentence, dtype=np.int64).reshape(1, -1)
224 |         else:
225 |             sentence = np.asarray(sentence[:max_len], dtype=np.int64).reshape(1, -1)
226 |         if num_pad > 0:
227 |             zero_paddings = np.zeros((1, num_pad), dtype=np.int64)
228 |             sentence = np.concatenate((sentence, zero_paddings), axis=1)
229 |         else:
230 |             num_words = max_len  # truncated sentences count as full length
231 | 
232 |         padded = np.concatenate((padded, sentence), axis=0)
233 |         seq_len = np.concatenate((seq_len, [num_words]))
234 |     return padded.astype(np.int64), seq_len.astype(np.int64)
235 | 
236 | 
237 | def get_mask_matrix(seq_lengths, max_len):
238 |     """
239 |     [5, 2, 4, ..., 7], 10 -->
240 |     [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
241 |      ...,
242 |      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
243 |     ]
244 |     :param seq_lengths: valid length of each sequence
245 |     :param max_len: padded length
246 |     :return: 0/1 mask matrix of shape (len(seq_lengths), max_len)
247 |     """
248 |     mask_matrix = np.ones((0, max_len))
249 |     for seq_len in seq_lengths:
250 |         num_mask = max_len - seq_len
251 |         mask = np.ones((1, seq_len), dtype=np.int64)
252 |         if num_mask > 0:
253 |             zero_paddings = np.zeros((1, num_mask), dtype=np.int64)
254 |             mask = np.concatenate((mask, zero_paddings), axis=1)
255 |         mask_matrix = np.concatenate((mask_matrix, mask), axis=0)
256 | 
257 |     return mask_matrix.astype(np.int64)
258 | 
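# Illustration (hypothetical values) tying the two helpers above together:
#
#   padded, seq_len = get_padding([[2, 3, 4], [5, 6]], max_len=5)
#   # padded  -> [[2, 3, 4, 0, 0],
#   #             [5, 6, 0, 0, 0]]
#   # seq_len -> [3, 2]
#   mask = get_mask_matrix(seq_len, max_len=5)
#   # mask    -> [[1, 1, 1, 0, 0],
#   #             [1, 1, 0, 0, 0]]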
259 | 
260 | class YDataset(object):
261 |     def __init__(self, features, labels, to_pad=True, max_len=40):
262 |         """
263 |         All sentences are lists of word indexes!
264 |         :param features: list containing the sequences to be padded and batched
265 |         :param labels: labels aligned with the sequences
266 |         """
267 |         self.features = features
268 |         self.labels = labels
269 |         self.pad_max_len = max_len
270 |         self.seq_lens = None
271 |         self.mask_matrix = None
272 | 
273 |         assert len(features) == len(self.labels)
274 | 
275 |         self._num_examples = len(self.labels)
276 |         self._epochs_completed = 0
277 |         self._index_in_epoch = 0
278 | 
279 |         if to_pad:
280 |             if max_len:
281 |                 self._padding()
282 |                 self._mask()
283 |             else:
284 |                 print "Need more information about padding max_length"
285 | 
286 |     def __len__(self):
287 |         return self._num_examples
288 | 
289 |     @property
290 |     def epochs_completed(self):
291 |         return self._epochs_completed
292 | 
293 |     def _padding(self):
294 |         self.features, self.seq_lens = get_padding(self.features, max_len=self.pad_max_len)
295 | 
296 |     def _mask(self):
297 |         self.mask_matrix = get_mask_matrix(self.seq_lens, max_len=self.pad_max_len)
298 | 
299 |     def _shuffle(self, seed):
300 |         """
301 |         After each epoch, the data need to be shuffled
302 |         (note: the seed argument is currently unused; np.random.shuffle uses global state)
303 |         """
304 |         perm = np.arange(self._num_examples)
305 |         np.random.shuffle(perm)
306 | 
307 |         self.features = self.features[perm]
308 |         self.seq_lens = self.seq_lens[perm]
309 |         self.mask_matrix = self.mask_matrix[perm]
310 |         self.labels = self.labels[perm]
311 | 
312 |     def next_batch(self, batch_size, seed=123456):
313 |         """Return the next `batch_size` examples from this data set."""
314 |         start = self._index_in_epoch
315 |         self._index_in_epoch += batch_size
316 |         if self._index_in_epoch > self._num_examples:
317 |             # finished an epoch
318 |             self._epochs_completed += 1
319 |             ''' shuffle features and labels '''
320 |             self._shuffle(seed=seed)
321 | 
322 |             start = 0
323 |             self._index_in_epoch = batch_size
324 |             assert batch_size <= self._num_examples
325 |         end = self._index_in_epoch
326 | 
327 |         features = self.features[start:end]
328 |         seq_lens = self.seq_lens[start:end]
329 |         mask_matrix = self.mask_matrix[start:end]
330 |         labels = self.labels[start:end]
331 | 
332 |         return features, seq_lens, mask_matrix, labels
333 | 
334 | 
335 | if __name__ == "__main__":
336 |     print "------------This is for utility test--------------"
337 | 
338 |     avg_str_len("data/mr/MR.task.test")
339 | 
--------------------------------------------------------------------------------
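Taken together, get_padding, get_mask_matrix and YDataset implement the input pipeline that main_batch.py consumes through next_batch. A minimal usage sketch (hypothetical index data; assumes the definitions in yutils.py are in scope):

import numpy as np

# hypothetical pre-indexed sentences and binary labels
xIndexes = [[2, 5, 7], [3, 4], [6, 2, 9, 8]]
yLabels = np.asarray([1, 0, 1], dtype=np.int64)

dataset = YDataset(xIndexes, yLabels, to_pad=True, max_len=5)

batch_size = 2
for _ in range(len(dataset) // batch_size):  # roughly one epoch
    feats, seq_lens, mask, labels = dataset.next_batch(batch_size)
    # feats: (2, 5) padded index matrix; seq_lens: (2,); mask: (2, 5); labels: (2,)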