├── .DS_Store ├── LICENCE ├── README.md ├── __init__.py ├── data ├── .DS_Store └── mr │ ├── MR.task.test │ ├── MR.task.test.labels │ ├── MR.task.test.sentences │ ├── MR.task.train │ ├── MR.task.train.labels │ ├── MR.task.train.sentences │ └── MR.task.unlabel ├── evaluate_batch.py ├── main_batch.py ├── nnet ├── .DS_Store ├── __init__.py ├── blstm.py └── lstm.py ├── preprocessing.py └── yutils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/.DS_Store -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. 
A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. 
You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. 
The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | 307 | including for purposes of Section 3(b); and 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. 
This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public 411 | licenses. Notwithstanding, Creative Commons may elect to apply one of 412 | its public licenses to material it publishes and in those instances 413 | will be considered the “Licensor.” The text of the Creative Commons 414 | public licenses is dedicated to the public domain under the CC0 Public 415 | Domain Dedication. 
Except for the limited purpose of indicating that 416 | material is shared under a Creative Commons public license or as 417 | otherwise permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. For 424 | the avoidance of doubt, this paragraph does not form part of the 425 | public licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. 428 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### The code is created by JianhuaYuan 3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/__init__.py -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/.DS_Store -------------------------------------------------------------------------------- /data/mr/MR.task.test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.test -------------------------------------------------------------------------------- /data/mr/MR.task.test.labels: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 1 5 | 0 6 | 1 7 | 1 8 | 0 9 | 1 10 | 1 11 | 0 12 | 1 13 | 0 14 | 0 15 | 1 16 | 0 17 | 0 18 | 0 19 | 1 20 | 0 21 | 1 22 | 0 23 | 1 24 | 0 25 | 1 26 | 1 27 | 0 28 | 1 29 | 0 30 | 1 31 | 1 32 | 0 33 | 0 34 | 1 35 | 0 36 | 1 37 | 1 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 0 45 | 1 46 | 1 47 | 0 48 | 0 49 | 0 50 | 1 51 | 1 52 | 1 53 | 1 54 | 0 55 | 0 56 | 1 57 | 1 58 | 0 59 | 1 60 | 0 61 | 1 62 | 0 63 | 1 64 | 0 65 | 0 66 | 1 67 | 1 68 | 1 69 | 1 70 | 1 71 | 1 72 | 0 73 | 0 74 | 1 75 | 0 76 | 0 77 | 0 78 | 1 79 | 0 80 | 1 81 | 1 82 | 1 83 | 1 84 | 0 85 | 1 86 | 1 87 | 0 88 | 1 89 | 0 90 | 0 91 | 1 92 | 1 93 | 1 94 | 0 95 | 1 96 | 1 97 | 1 98 | 0 99 | 0 100 | 1 101 | 1 102 | 0 103 | 1 104 | 1 105 | 1 106 | 1 107 | 0 108 | 1 109 | 1 110 | 1 111 | 1 112 | 0 113 | 1 114 | 1 115 | 1 116 | 0 117 | 1 118 | 1 119 | 0 120 | 0 121 | 1 122 | 0 123 | 1 124 | 0 125 | 0 126 | 1 127 | 1 128 | 1 129 | 0 130 | 0 131 | 0 132 | 1 133 | 0 134 | 1 135 | 1 136 | 1 137 | 0 138 | 0 139 | 1 140 | 0 141 | 0 142 | 0 143 | 1 144 | 1 145 | 1 146 | 0 147 | 1 148 | 0 149 | 0 150 | 1 151 | 1 152 | 1 153 | 0 154 | 0 155 | 0 156 | 0 157 | 0 158 | 0 159 | 0 160 | 0 161 | 0 162 | 1 163 | 1 164 | 1 165 | 1 166 | 0 167 | 1 168 | 1 169 | 0 170 | 0 171 | 1 172 | 0 173 | 1 174 | 0 175 | 0 176 | 1 177 | 1 178 | 0 179 | 0 180 | 1 181 | 0 182 | 1 183 | 0 184 | 1 185 | 0 186 | 1 187 | 1 188 | 1 189 | 1 190 | 1 191 | 1 192 | 0 193 | 0 194 | 0 195 | 1 196 | 1 197 | 1 198 | 
0 199 | 1 200 | 1 201 | 0 202 | 0 203 | 0 204 | 0 205 | 1 206 | 1 207 | 0 208 | 0 209 | 1 210 | 1 211 | 1 212 | 0 213 | 0 214 | 1 215 | 1 216 | 0 217 | 1 218 | 1 219 | 1 220 | 1 221 | 0 222 | 0 223 | 0 224 | 1 225 | 1 226 | 0 227 | 1 228 | 0 229 | 0 230 | 1 231 | 0 232 | 0 233 | 1 234 | 1 235 | 0 236 | 0 237 | 1 238 | 1 239 | 0 240 | 1 241 | 0 242 | 0 243 | 1 244 | 1 245 | 0 246 | 1 247 | 1 248 | 0 249 | 0 250 | 0 251 | 0 252 | 1 253 | 1 254 | 0 255 | 1 256 | 1 257 | 0 258 | 0 259 | 0 260 | 1 261 | 1 262 | 1 263 | 0 264 | 1 265 | 0 266 | 1 267 | 0 268 | 0 269 | 1 270 | 1 271 | 0 272 | 1 273 | 0 274 | 1 275 | 1 276 | 0 277 | 0 278 | 0 279 | 1 280 | 0 281 | 1 282 | 0 283 | 1 284 | 1 285 | 0 286 | 1 287 | 0 288 | 1 289 | 0 290 | 0 291 | 0 292 | 0 293 | 0 294 | 1 295 | 1 296 | 1 297 | 0 298 | 0 299 | 1 300 | 1 301 | 1 302 | 0 303 | 1 304 | 1 305 | 0 306 | 0 307 | 0 308 | 1 309 | 0 310 | 1 311 | 0 312 | 0 313 | 1 314 | 1 315 | 1 316 | 0 317 | 1 318 | 0 319 | 1 320 | 1 321 | 1 322 | 1 323 | 0 324 | 0 325 | 0 326 | 0 327 | 0 328 | 1 329 | 0 330 | 1 331 | 0 332 | 1 333 | 0 334 | 0 335 | 1 336 | 1 337 | 1 338 | 0 339 | 0 340 | 1 341 | 0 342 | 0 343 | 1 344 | 1 345 | 1 346 | 1 347 | 1 348 | 1 349 | 1 350 | 0 351 | 0 352 | 0 353 | 0 354 | 0 355 | 1 356 | 1 357 | 1 358 | 1 359 | 1 360 | 1 361 | 1 362 | 1 363 | 0 364 | 1 365 | 1 366 | 1 367 | 0 368 | 0 369 | 0 370 | 0 371 | 1 372 | 0 373 | 0 374 | 1 375 | 0 376 | 0 377 | 1 378 | 0 379 | 0 380 | 0 381 | 1 382 | 0 383 | 1 384 | 0 385 | 0 386 | 1 387 | 0 388 | 0 389 | 1 390 | 1 391 | 1 392 | 1 393 | 0 394 | 0 395 | 1 396 | 1 397 | 0 398 | 1 399 | 1 400 | 1 401 | -------------------------------------------------------------------------------- /data/mr/MR.task.test.sentences: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.test.sentences -------------------------------------------------------------------------------- /data/mr/MR.task.train: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.train -------------------------------------------------------------------------------- /data/mr/MR.task.train.labels: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 0 7 | 0 8 | 0 9 | 1 10 | 1 11 | 1 12 | 0 13 | 1 14 | 0 15 | 0 16 | 0 17 | 1 18 | 0 19 | 0 20 | 0 21 | 1 22 | 1 23 | 1 24 | 0 25 | 1 26 | 1 27 | 0 28 | 0 29 | 1 30 | 0 31 | 0 32 | 1 33 | 0 34 | 0 35 | 1 36 | 0 37 | 1 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 1 44 | 1 45 | 0 46 | 1 47 | 1 48 | 0 49 | 0 50 | 1 51 | 0 52 | 0 53 | 0 54 | 1 55 | 1 56 | 0 57 | 1 58 | 1 59 | 0 60 | 1 61 | 1 62 | 1 63 | 1 64 | 1 65 | 1 66 | 0 67 | 1 68 | 0 69 | 0 70 | 0 71 | 0 72 | 1 73 | 0 74 | 1 75 | 1 76 | 0 77 | 1 78 | 1 79 | 1 80 | 1 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 1 87 | 1 88 | 0 89 | 0 90 | 0 91 | 1 92 | 1 93 | 0 94 | 1 95 | 1 96 | 1 97 | 0 98 | 0 99 | 1 100 | 0 101 | 0 102 | 0 103 | 1 104 | 0 105 | 0 106 | 0 107 | 0 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 0 114 | 0 115 | 1 116 | 0 117 | 1 118 | 0 119 | 0 120 | 0 121 | 0 122 | 0 123 | 0 124 | 1 125 | 1 126 | 0 127 | 0 128 | 0 129 | 1 130 | 1 131 | 0 132 | 1 133 | 1 134 | 0 135 | 1 136 | 1 137 | 0 138 | 0 139 | 1 140 | 0 141 | 1 142 | 0 143 | 1 144 | 0 145 | 0 146 | 0 147 | 1 148 | 0 149 | 1 150 | 0 151 | 1 152 | 1 153 | 0 154 | 
0 155 | 1 156 | 1 157 | 1 158 | 1 159 | 0 160 | 0 161 | 0 162 | 1 163 | 0 164 | 0 165 | 1 166 | 1 167 | 0 168 | 0 169 | 0 170 | 0 171 | 0 172 | 1 173 | 0 174 | 0 175 | 1 176 | 0 177 | 0 178 | 0 179 | 0 180 | 0 181 | 0 182 | 0 183 | 0 184 | 1 185 | 1 186 | 0 187 | 0 188 | 1 189 | 0 190 | 1 191 | 0 192 | 0 193 | 0 194 | 0 195 | 0 196 | 0 197 | 0 198 | 1 199 | 1 200 | 0 201 | 1 202 | 0 203 | 0 204 | 1 205 | 1 206 | 0 207 | 0 208 | 1 209 | 0 210 | 0 211 | 0 212 | 1 213 | 1 214 | 0 215 | 0 216 | 0 217 | 0 218 | 1 219 | 1 220 | 0 221 | 0 222 | 1 223 | 0 224 | 1 225 | 1 226 | 1 227 | 1 228 | 0 229 | 0 230 | 0 231 | 0 232 | 0 233 | 1 234 | 0 235 | 1 236 | 1 237 | 0 238 | 0 239 | 0 240 | 1 241 | 1 242 | 1 243 | 0 244 | 0 245 | 1 246 | 0 247 | 0 248 | 0 249 | 0 250 | 1 251 | 1 252 | 1 253 | 1 254 | 0 255 | 0 256 | 1 257 | 0 258 | 0 259 | 1 260 | 1 261 | 0 262 | 0 263 | 1 264 | 0 265 | 0 266 | 0 267 | 0 268 | 0 269 | 1 270 | 1 271 | 1 272 | 1 273 | 0 274 | 0 275 | 1 276 | 0 277 | 0 278 | 1 279 | 0 280 | 1 281 | 1 282 | 0 283 | 1 284 | 1 285 | 0 286 | 0 287 | 0 288 | 1 289 | 0 290 | 0 291 | 1 292 | 0 293 | 0 294 | 1 295 | 1 296 | 0 297 | 1 298 | 0 299 | 1 300 | 0 301 | 0 302 | 1 303 | 0 304 | 0 305 | 0 306 | 0 307 | 0 308 | 1 309 | 1 310 | 1 311 | 0 312 | 1 313 | 0 314 | 1 315 | 1 316 | 0 317 | 1 318 | 1 319 | 0 320 | 0 321 | 0 322 | 1 323 | 0 324 | 0 325 | 1 326 | 1 327 | 0 328 | 1 329 | 0 330 | 0 331 | 0 332 | 1 333 | 0 334 | 1 335 | 0 336 | 0 337 | 1 338 | 1 339 | 0 340 | 1 341 | 1 342 | 0 343 | 1 344 | 1 345 | 1 346 | 1 347 | 1 348 | 1 349 | 1 350 | 1 351 | 1 352 | 0 353 | 0 354 | 0 355 | 0 356 | 1 357 | 1 358 | 0 359 | 0 360 | 1 361 | 0 362 | 1 363 | 0 364 | 0 365 | 0 366 | 1 367 | 0 368 | 0 369 | 1 370 | 1 371 | 0 372 | 0 373 | 0 374 | 1 375 | 0 376 | 0 377 | 0 378 | 1 379 | 0 380 | 0 381 | 0 382 | 0 383 | 1 384 | 0 385 | 0 386 | 1 387 | 0 388 | 0 389 | 0 390 | 0 391 | 0 392 | 0 393 | 1 394 | 1 395 | 0 396 | 0 397 | 0 398 | 0 399 | 1 400 | 0 401 | 0 402 | 0 403 | 1 404 | 0 405 | 0 406 | 1 407 | 0 408 | 0 409 | 0 410 | 1 411 | 1 412 | 0 413 | 1 414 | 1 415 | 0 416 | 0 417 | 0 418 | 1 419 | 1 420 | 0 421 | 0 422 | 1 423 | 0 424 | 0 425 | 1 426 | 1 427 | 0 428 | 1 429 | 1 430 | 1 431 | 1 432 | 1 433 | 1 434 | 0 435 | 1 436 | 0 437 | 1 438 | 1 439 | 1 440 | 0 441 | 0 442 | 0 443 | 0 444 | 1 445 | 0 446 | 0 447 | 1 448 | 0 449 | 1 450 | 1 451 | 0 452 | 1 453 | 1 454 | 0 455 | 0 456 | 0 457 | 1 458 | 0 459 | 1 460 | 1 461 | 1 462 | 1 463 | 0 464 | 1 465 | 1 466 | 0 467 | 0 468 | 1 469 | 1 470 | 1 471 | 1 472 | 1 473 | 1 474 | 0 475 | 0 476 | 0 477 | 1 478 | 0 479 | 1 480 | 0 481 | 0 482 | 1 483 | 1 484 | 1 485 | 1 486 | 0 487 | 1 488 | 1 489 | 1 490 | 1 491 | 1 492 | 1 493 | 0 494 | 0 495 | 1 496 | 1 497 | 1 498 | 0 499 | 0 500 | 0 501 | 0 502 | 0 503 | 1 504 | 0 505 | 1 506 | 0 507 | 1 508 | 1 509 | 0 510 | 1 511 | 1 512 | 1 513 | 0 514 | 0 515 | 1 516 | 0 517 | 1 518 | 0 519 | 0 520 | 0 521 | 0 522 | 0 523 | 1 524 | 1 525 | 1 526 | 1 527 | 1 528 | 1 529 | 1 530 | 1 531 | 1 532 | 0 533 | 1 534 | 1 535 | 1 536 | 1 537 | 1 538 | 1 539 | 0 540 | 1 541 | 1 542 | 1 543 | 0 544 | 1 545 | 0 546 | 0 547 | 0 548 | 1 549 | 1 550 | 1 551 | 1 552 | 0 553 | 1 554 | 1 555 | 0 556 | 0 557 | 1 558 | 0 559 | 0 560 | 0 561 | 1 562 | 0 563 | 0 564 | 0 565 | 0 566 | 0 567 | 0 568 | 0 569 | 0 570 | 1 571 | 0 572 | 0 573 | 0 574 | 1 575 | 1 576 | 1 577 | 0 578 | 0 579 | 1 580 | 0 581 | 0 582 | 0 583 | 1 584 | 1 585 | 0 586 | 0 587 | 1 588 | 0 589 | 1 590 | 0 591 | 1 592 | 1 593 | 1 594 | 0 595 | 1 596 | 1 597 | 1 598 | 1 
599 | 1 600 | 1 601 | 0 602 | 0 603 | 1 604 | 1 605 | 1 606 | 1 607 | 1 608 | 1 609 | 1 610 | 0 611 | 1 612 | 1 613 | 1 614 | 0 615 | 1 616 | 1 617 | 0 618 | 0 619 | 1 620 | 1 621 | 0 622 | 0 623 | 1 624 | 1 625 | 1 626 | 1 627 | 1 628 | 1 629 | 1 630 | 0 631 | 1 632 | 1 633 | 0 634 | 1 635 | 0 636 | 1 637 | 1 638 | 1 639 | 1 640 | 0 641 | 1 642 | 0 643 | 1 644 | 1 645 | 0 646 | 1 647 | 1 648 | 0 649 | 1 650 | 0 651 | 0 652 | 1 653 | 1 654 | 1 655 | 0 656 | 1 657 | 0 658 | 1 659 | 1 660 | 0 661 | 0 662 | 0 663 | 1 664 | 1 665 | 1 666 | 0 667 | 1 668 | 0 669 | 0 670 | 1 671 | 1 672 | 0 673 | 0 674 | 1 675 | 1 676 | 0 677 | 1 678 | 0 679 | 1 680 | 1 681 | 1 682 | 1 683 | 1 684 | 0 685 | 1 686 | 1 687 | 1 688 | 0 689 | 0 690 | 0 691 | 0 692 | 1 693 | 0 694 | 1 695 | 1 696 | 0 697 | 1 698 | 1 699 | 0 700 | 1 701 | 1 702 | 1 703 | 0 704 | 0 705 | 1 706 | 1 707 | 1 708 | 1 709 | 0 710 | 0 711 | 1 712 | 1 713 | 1 714 | 0 715 | 1 716 | 0 717 | 1 718 | 1 719 | 0 720 | 0 721 | 1 722 | 0 723 | 1 724 | 1 725 | 0 726 | 0 727 | 1 728 | 0 729 | 1 730 | 0 731 | 1 732 | 0 733 | 1 734 | 1 735 | 1 736 | 1 737 | 0 738 | 0 739 | 0 740 | 1 741 | 0 742 | 0 743 | 1 744 | 1 745 | 1 746 | 0 747 | 1 748 | 1 749 | 1 750 | 1 751 | 0 752 | 0 753 | 1 754 | 0 755 | 0 756 | 1 757 | 1 758 | 1 759 | 1 760 | 1 761 | 0 762 | 1 763 | 1 764 | 0 765 | 1 766 | 0 767 | 1 768 | 0 769 | 1 770 | 0 771 | 1 772 | 1 773 | 0 774 | 0 775 | 0 776 | 0 777 | 1 778 | 0 779 | 1 780 | 0 781 | 1 782 | 1 783 | 0 784 | 0 785 | 1 786 | 1 787 | 0 788 | 1 789 | 0 790 | 1 791 | 0 792 | 1 793 | 0 794 | 0 795 | 0 796 | 1 797 | 1 798 | 1 799 | 0 800 | 0 801 | 1 802 | 1 803 | 1 804 | 0 805 | 0 806 | 0 807 | 1 808 | 0 809 | 1 810 | 1 811 | 1 812 | 0 813 | 1 814 | 0 815 | 1 816 | 1 817 | 0 818 | 0 819 | 1 820 | 0 821 | 0 822 | 0 823 | 1 824 | 1 825 | 0 826 | 1 827 | 0 828 | 1 829 | 0 830 | 0 831 | 0 832 | 1 833 | 1 834 | 1 835 | 0 836 | 1 837 | 0 838 | 0 839 | 1 840 | 1 841 | 1 842 | 1 843 | 0 844 | 0 845 | 1 846 | 1 847 | 0 848 | 0 849 | 0 850 | 1 851 | 1 852 | 1 853 | 0 854 | 1 855 | 0 856 | 0 857 | 1 858 | 0 859 | 0 860 | 1 861 | 1 862 | 0 863 | 1 864 | 1 865 | 0 866 | 0 867 | 0 868 | 1 869 | 0 870 | 0 871 | 0 872 | 0 873 | 0 874 | 1 875 | 0 876 | 1 877 | 1 878 | 0 879 | 1 880 | 0 881 | 1 882 | 1 883 | 0 884 | 1 885 | 1 886 | 1 887 | 1 888 | 1 889 | 0 890 | 0 891 | 1 892 | 1 893 | 1 894 | 0 895 | 1 896 | 0 897 | 1 898 | 0 899 | 0 900 | 1 901 | 0 902 | 0 903 | 1 904 | 1 905 | 1 906 | 1 907 | 1 908 | 0 909 | 0 910 | 1 911 | 0 912 | 1 913 | 1 914 | 0 915 | 1 916 | 0 917 | 0 918 | 1 919 | 1 920 | 1 921 | 1 922 | 0 923 | 0 924 | 0 925 | 1 926 | 0 927 | 0 928 | 1 929 | 1 930 | 0 931 | 1 932 | 0 933 | 1 934 | 0 935 | 1 936 | 0 937 | 0 938 | 1 939 | 1 940 | 1 941 | 0 942 | 0 943 | 1 944 | 0 945 | 0 946 | 1 947 | 0 948 | 1 949 | 1 950 | 0 951 | 1 952 | 0 953 | 0 954 | 1 955 | 0 956 | 0 957 | 1 958 | 1 959 | 0 960 | 1 961 | 0 962 | 1 963 | 1 964 | 1 965 | 1 966 | 1 967 | 0 968 | 1 969 | 0 970 | 0 971 | 0 972 | 1 973 | 0 974 | 1 975 | 1 976 | 1 977 | 1 978 | 1 979 | 1 980 | 1 981 | 1 982 | 1 983 | 0 984 | 1 985 | 0 986 | 1 987 | 1 988 | 1 989 | 0 990 | 0 991 | 0 992 | 1 993 | 0 994 | 0 995 | 1 996 | 0 997 | 0 998 | 1 999 | 1 1000 | 0 1001 | 0 1002 | 1 1003 | 1 1004 | 0 1005 | 1 1006 | 1 1007 | 0 1008 | 0 1009 | 0 1010 | 0 1011 | 0 1012 | 0 1013 | 0 1014 | 1 1015 | 0 1016 | 0 1017 | 0 1018 | 0 1019 | 1 1020 | 1 1021 | 1 1022 | 1 1023 | 0 1024 | 1 1025 | 0 1026 | 0 1027 | 0 1028 | 1 1029 | 0 1030 | 0 1031 | 1 1032 | 1 1033 | 1 1034 | 0 1035 | 1 1036 | 0 1037 | 1 1038 
| 0 1039 | 1 1040 | 0 1041 | 0 1042 | 0 1043 | 0 1044 | 0 1045 | 0 1046 | 0 1047 | 0 1048 | 1 1049 | 1 1050 | 0 1051 | 0 1052 | 0 1053 | 1 1054 | 1 1055 | 0 1056 | 0 1057 | 0 1058 | 1 1059 | 1 1060 | 1 1061 | 1 1062 | 0 1063 | 0 1064 | 0 1065 | 0 1066 | 1 1067 | 1 1068 | 1 1069 | 1 1070 | 1 1071 | 0 1072 | 1 1073 | 1 1074 | 1 1075 | 1 1076 | 1 1077 | 1 1078 | 0 1079 | 1 1080 | 1 1081 | 1 1082 | 1 1083 | 1 1084 | 0 1085 | 1 1086 | 0 1087 | 1 1088 | 1 1089 | 0 1090 | 1 1091 | 0 1092 | 1 1093 | 0 1094 | 1 1095 | 1 1096 | 1 1097 | 1 1098 | 1 1099 | 0 1100 | 0 1101 | 0 1102 | 0 1103 | 0 1104 | 0 1105 | 0 1106 | 0 1107 | 0 1108 | 1 1109 | 1 1110 | 1 1111 | 0 1112 | 0 1113 | 1 1114 | 1 1115 | 0 1116 | 0 1117 | 0 1118 | 0 1119 | 0 1120 | 0 1121 | 1 1122 | 1 1123 | 0 1124 | 1 1125 | 1 1126 | 1 1127 | 0 1128 | 0 1129 | 0 1130 | 0 1131 | 0 1132 | 0 1133 | 0 1134 | 0 1135 | 1 1136 | 0 1137 | 1 1138 | 1 1139 | 1 1140 | 1 1141 | 0 1142 | 0 1143 | 0 1144 | 0 1145 | 0 1146 | 0 1147 | 1 1148 | 0 1149 | 1 1150 | 0 1151 | 0 1152 | 1 1153 | 0 1154 | 0 1155 | 1 1156 | 1 1157 | 1 1158 | 0 1159 | 0 1160 | 1 1161 | 0 1162 | 1 1163 | 0 1164 | 1 1165 | 1 1166 | 0 1167 | 0 1168 | 1 1169 | 1 1170 | 1 1171 | 0 1172 | 0 1173 | 1 1174 | 0 1175 | 1 1176 | 1 1177 | 1 1178 | 0 1179 | 1 1180 | 1 1181 | 0 1182 | 0 1183 | 1 1184 | 0 1185 | 0 1186 | 1 1187 | 0 1188 | 1 1189 | 0 1190 | 1 1191 | 0 1192 | 1 1193 | 1 1194 | 0 1195 | 1 1196 | 0 1197 | 0 1198 | 1 1199 | 1 1200 | 0 1201 | 0 1202 | 0 1203 | 1 1204 | 0 1205 | 0 1206 | 1 1207 | 1 1208 | 1 1209 | 0 1210 | 1 1211 | 0 1212 | 1 1213 | 1 1214 | 1 1215 | 0 1216 | 0 1217 | 0 1218 | 0 1219 | 0 1220 | 1 1221 | 1 1222 | 0 1223 | 1 1224 | 0 1225 | 0 1226 | 1 1227 | 0 1228 | 1 1229 | 0 1230 | 0 1231 | 1 1232 | 0 1233 | 0 1234 | 0 1235 | 0 1236 | 0 1237 | 1 1238 | 0 1239 | 0 1240 | 0 1241 | 1 1242 | 1 1243 | 0 1244 | 0 1245 | 1 1246 | 0 1247 | 1 1248 | 1 1249 | 1 1250 | 0 1251 | 0 1252 | 0 1253 | 0 1254 | 0 1255 | 0 1256 | 1 1257 | 0 1258 | 0 1259 | 0 1260 | 1 1261 | 0 1262 | 0 1263 | 1 1264 | 0 1265 | 1 1266 | 0 1267 | 1 1268 | 0 1269 | 0 1270 | 0 1271 | 0 1272 | 1 1273 | 1 1274 | 0 1275 | 1 1276 | 1 1277 | 0 1278 | 1 1279 | 1 1280 | 1 1281 | 1 1282 | 0 1283 | 0 1284 | 1 1285 | 0 1286 | 1 1287 | 1 1288 | 0 1289 | 0 1290 | 1 1291 | 0 1292 | 0 1293 | 0 1294 | 1 1295 | 0 1296 | 0 1297 | 0 1298 | 0 1299 | 0 1300 | 0 1301 | 0 1302 | 0 1303 | 1 1304 | 1 1305 | 1 1306 | 1 1307 | 1 1308 | 0 1309 | 1 1310 | 0 1311 | 1 1312 | 1 1313 | 1 1314 | 0 1315 | 1 1316 | 1 1317 | 0 1318 | 1 1319 | 0 1320 | 1 1321 | 0 1322 | 1 1323 | 1 1324 | 0 1325 | 0 1326 | 0 1327 | 0 1328 | 0 1329 | 1 1330 | 1 1331 | 0 1332 | 1 1333 | 1 1334 | 1 1335 | 0 1336 | 0 1337 | 0 1338 | 1 1339 | 0 1340 | 1 1341 | 1 1342 | 0 1343 | 1 1344 | 0 1345 | 0 1346 | 1 1347 | 0 1348 | 0 1349 | 0 1350 | 0 1351 | 0 1352 | 1 1353 | 0 1354 | 0 1355 | 0 1356 | 0 1357 | 0 1358 | 0 1359 | 0 1360 | 0 1361 | 1 1362 | 1 1363 | 0 1364 | 0 1365 | 0 1366 | 0 1367 | 0 1368 | 1 1369 | 0 1370 | 1 1371 | 0 1372 | 0 1373 | 0 1374 | 1 1375 | 1 1376 | 0 1377 | 0 1378 | 1 1379 | 0 1380 | 1 1381 | 0 1382 | 1 1383 | 1 1384 | 1 1385 | 1 1386 | 0 1387 | 0 1388 | 0 1389 | 0 1390 | 1 1391 | 1 1392 | 1 1393 | 1 1394 | 0 1395 | 1 1396 | 1 1397 | 1 1398 | 1 1399 | 0 1400 | 0 1401 | 1 1402 | 0 1403 | 1 1404 | 1 1405 | 1 1406 | 0 1407 | 0 1408 | 0 1409 | 1 1410 | 1 1411 | 0 1412 | 1 1413 | 1 1414 | 1 1415 | 0 1416 | 1 1417 | 0 1418 | 1 1419 | 1 1420 | 0 1421 | 1 1422 | 1 1423 | 0 1424 | 1 1425 | 1 1426 | 1 1427 | 0 1428 | 0 1429 | 0 1430 | 1 1431 | 0 1432 | 0 1433 
| 0 1434 | 1 1435 | 0 1436 | 1 1437 | 0 1438 | 0 1439 | 1 1440 | 0 1441 | 0 1442 | 0 1443 | 0 1444 | 0 1445 | 1 1446 | 0 1447 | 1 1448 | 1 1449 | 0 1450 | 1 1451 | 0 1452 | 0 1453 | 0 1454 | 0 1455 | 0 1456 | 1 1457 | 0 1458 | 1 1459 | 0 1460 | 1 1461 | 1 1462 | 0 1463 | 1 1464 | 0 1465 | 1 1466 | 0 1467 | 0 1468 | 1 1469 | 0 1470 | 0 1471 | 0 1472 | 1 1473 | 1 1474 | 1 1475 | 1 1476 | 0 1477 | 1 1478 | 0 1479 | 0 1480 | 1 1481 | 0 1482 | 0 1483 | 1 1484 | 0 1485 | 1 1486 | 0 1487 | 0 1488 | 0 1489 | 1 1490 | 1 1491 | 1 1492 | 0 1493 | 1 1494 | 1 1495 | 0 1496 | 0 1497 | 0 1498 | 1 1499 | 0 1500 | 1 1501 | 1 1502 | 0 1503 | 1 1504 | 0 1505 | 0 1506 | 1 1507 | 0 1508 | 0 1509 | 1 1510 | 1 1511 | 1 1512 | 0 1513 | 0 1514 | 1 1515 | 0 1516 | 1 1517 | 0 1518 | 1 1519 | 0 1520 | 1 1521 | 0 1522 | 0 1523 | 0 1524 | 0 1525 | 1 1526 | 1 1527 | 0 1528 | 1 1529 | 1 1530 | 0 1531 | 0 1532 | 0 1533 | 0 1534 | 1 1535 | 1 1536 | 0 1537 | 1 1538 | 0 1539 | 0 1540 | 0 1541 | 0 1542 | 1 1543 | 0 1544 | 0 1545 | 0 1546 | 1 1547 | 0 1548 | 0 1549 | 1 1550 | 0 1551 | 0 1552 | 1 1553 | 0 1554 | 1 1555 | 0 1556 | 0 1557 | 1 1558 | 1 1559 | 1 1560 | 0 1561 | 0 1562 | 0 1563 | 1 1564 | 0 1565 | 0 1566 | 1 1567 | 1 1568 | 0 1569 | 0 1570 | 1 1571 | 1 1572 | 0 1573 | 1 1574 | 0 1575 | 1 1576 | 1 1577 | 1 1578 | 0 1579 | 1 1580 | 0 1581 | 1 1582 | 0 1583 | 1 1584 | 1 1585 | 1 1586 | 0 1587 | 0 1588 | 1 1589 | 1 1590 | 1 1591 | 0 1592 | 0 1593 | 0 1594 | 0 1595 | 1 1596 | 0 1597 | 0 1598 | 0 1599 | 1 1600 | 0 1601 | -------------------------------------------------------------------------------- /data/mr/MR.task.train.sentences: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.train.sentences -------------------------------------------------------------------------------- /data/mr/MR.task.unlabel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/data/mr/MR.task.unlabel -------------------------------------------------------------------------------- /evaluate_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf8 3 | 4 | import argparse 5 | import sys 6 | import time 7 | 8 | import torch 9 | 10 | sys.path.append('../') 11 | import vectorize 12 | import yutils 13 | import preprocessing 14 | 15 | 16 | def classify_batch(args, model, targets, targets_seqlen, targets_mask, tweets, tweets_seqlen, tweets_mask): 17 | """ 18 | Predict a single batch 19 | return predictions & max_att_weight 20 | """ 21 | assert len(tweets) == len(targets) 22 | 23 | model.eval() 24 | ''' Prepare data and prediction''' 25 | batch_size = len(targets) 26 | from main_batch import var_batch 27 | targets_, targets_seqlen_, targets_mask_, tweets_, tweets_seqlen_, tweets_mask_ = \ 28 | var_batch(args, batch_size, 29 | targets, targets_seqlen, targets_mask, 30 | tweets, tweets_seqlen, tweets_mask) 31 | 32 | probs, _ = model((tweets_, tweets_seqlen_, tweets_mask_), 33 | (targets_, targets_seqlen_, targets_mask_)) 34 | 35 | pred_weight, pred = torch.max(probs, dim=1) 36 | 37 | if args.cuda: 38 | pred = pred.view(-1).cpu().data.numpy() 39 | pred_weights = pred_weight.view(-1).cpu().data.numpy() 40 | else: 41 | pred = pred.view(-1).data.numpy() 42 | pred_weights = pred_weight.view(-1).data.numpy() 43 | 44 | return pred, 
pred_weights 45 | 46 | 47 | def evaluate(args, model, word2idx, seged_tweets, seged_targets): 48 | """ 49 | Input: 50 | 1. list of segmented tweets 51 | 2. list of segmented targets 52 | Output: 53 | 1. list of stance labels for tweets towards targets 54 | 55 | Procedure: 56 | - vectorize the segmented words (with the Word2Vec function in utils, **or directly with the word2idx+embeddings from the JSON file) 57 | - compute the seq_len and mask_matrix of each segmented sentence 58 | - wrap all parameters in Variable, depending on whether the GPU is used 59 | - compute and return the classification results 60 | 61 | :param seged_tweets: 62 | :param seged_targets: 63 | :param word2idx: 64 | :param args: 65 | :param model: 66 | :return: 67 | """ 68 | 69 | ''' sentences to lists_of_word_index ''' 70 | tic = time.time() 71 | tweets = vectorize.sentence_to_index(word2idx, seged_tweets) 72 | targets = vectorize.sentence_to_index(word2idx, seged_targets) 73 | ''' seq_lens and mask_matrix for each sentence ''' 74 | tweets, tweets_seqlen = yutils.get_padding(tweets, max_len=args.ans_max_len) 75 | tweets_mask = yutils.get_mask_matrix(tweets_seqlen, max_len=args.ans_max_len) 76 | targets, targets_seqlen = yutils.get_padding(targets, max_len=args.ask_max_len) 77 | targets_mask = yutils.get_mask_matrix(targets_seqlen, max_len=args.ask_max_len) 78 | assert len(tweets) == len(targets) 79 | # print tweets[0], tweets_seqlen[0], tweets_mask[0] 80 | 81 | print "--------------------" 82 | ''' Variable all parameters ''' 83 | ''' 1. decide batch_size, batch_num ''' 84 | total = len(tweets) 85 | bs = 1000 # batch_size 86 | bn = int(total / bs) # batch_num 87 | left = total - bs * bn 88 | 89 | ''' 2. classify each batch and combine the predictions, a for loop ''' 90 | pred = [] 91 | pred_weights = [] 92 | # batch_size, batch_num 93 | for b in range(bn): 94 | pred_batch, pred_weight_batch = classify_batch(args, model, 95 | targets[b * bs:(b + 1) * bs], 96 | targets_seqlen[b * bs:(b + 1) * bs], 97 | targets_mask[b * bs:(b + 1) * bs], 98 | tweets[b * bs:(b + 1) * bs], 99 | tweets_seqlen[b * bs:(b + 1) * bs], 100 | tweets_mask[b * bs:(b + 1) * bs]) 101 | pred.extend(pred_batch) 102 | pred_weights.extend(pred_weight_batch) 103 | if left > 0: 104 | pred_batch, pred_weight_batch = classify_batch(args, model, 105 | targets[bs * bn:], 106 | targets_seqlen[bs * bn:], 107 | targets_mask[bs * bn:], 108 | tweets[bs * bn:], 109 | tweets_seqlen[bs * bn:], 110 | tweets_mask[bs * bn:]) 111 | pred.extend(pred_batch) 112 | pred_weights.extend(pred_weight_batch) 113 | tit = time.time() - tic 114 | print " Predicting {:d} examples using {:5.4f} seconds".format(total, tit) 115 | 116 | ''' Adjust weights here if needed ''' 117 | # utils.write_list2file(pred_weights, "../data/evaluate/out_predictions_weights.txt") 118 | 119 | return pred, pred_weights 120 | 121 | 122 | def example_main(args, model, word2idx): 123 | print "Begin to classify tweet|target pairs " 124 | """ Load and segment raw tweets|targets files """ 125 | tweets = yutils.read_file2list(args.input + "/processed/seged/a_test_tweets.txt") 126 | targets = yutils.read_file2list(args.input + "/processed/seged/a_test_targets.txt") 127 | seged_tweets = yutils.seg_sentence(tweets, choice="list", place="hpc") # may use lexicon here 128 | seged_targets = yutils.seg_sentence(targets, choice="list", place="hpc") 129 | predictions, pred_weights = evaluate(args, model, word2idx, seged_tweets, seged_targets) 130 | 131 | # for calculating the 10k results 132 | yutils.write_list2file(predictions, "out_predictions.txt") 133 | yutils.write_list2file(pred_weights, "out_predictions_weights.txt") 134 | 135 | preprocessing.write_stance_txt(args.input +
"SemEval2016-Task6-subtaskA-testdata.txt", 136 | "out_predictions.txt", 137 | "z_result/SemEval2016-Task6-subtaskA-testdata-pred.txt") 138 | 139 | 140 | def example_single(args, model, word2idx): 141 | """ Load and segment pair in the command line """ 142 | while True: 143 | target = raw_input("问题: ") 144 | tweet = raw_input("回答: ") 145 | targets = [str(target)] 146 | tweets = [str(tweet)] 147 | seged_tweets = yutils.seg_sentence(tweets, choice="list", place="hpc") # may use lexicon here 148 | seged_targets = yutils.seg_sentence(targets, choice="list", place="hpc") 149 | predictions = evaluate(args, model, word2idx, seged_tweets, seged_targets) 150 | print "预测结果: ", predictions 151 | 152 | 153 | def savez_model(model, model_name="np_AoABatchWinGRU_100_6025_batch8.npz"): 154 | state = model.state_dict() 155 | # output.bias [-0.09973772 0.09077224 0.00347146] 156 | print type(state), len(state), dir(state) 157 | print state.items()[-1], type(state.items()[-1]) 158 | print state.items()[-1][0], state.items()[-1][1].cpu().numpy() 159 | import numpy as np 160 | new_state = dict() 161 | for item in state.items(): 162 | new_state[item[0]] = item[1].cpu().numpy() 163 | np.savez(model_name, **new_state) 164 | state = np.load("aoa.npz") 165 | print state.files 166 | 167 | 168 | if __name__ == "__main__": 169 | parser = argparse.ArgumentParser(description="PyTorch AoA for Sogou Project") 170 | 171 | ''' load data and save model''' 172 | parser.add_argument("--input", type=str, default="../data/semeval2016/", 173 | help="location of dataset") 174 | parser.add_argument("--word2idx", type=str, default="../data/semeval2016/task_a/word2idx_glove.pkl", 175 | help="location of word2idx dictionary") 176 | parser.add_argument("--save", type=str, default="../saved_model/AoABatch/", 177 | help="path to save the model") 178 | parser.add_argument("--target", type=str, default="", 179 | help="which target to classify") 180 | 181 | parser.add_argument("--seed", type=int, default=123456, 182 | help="random seed for reproduction") 183 | parser.add_argument("--cuda", action="store_true", 184 | help="use CUDA") 185 | 186 | ''' test purpose''' 187 | parser.add_argument("--ans_max_len", type=int, default=25, 188 | help="max time step of tweet sequence") 189 | parser.add_argument("--ask_max_len", type=int, default=6, 190 | help="max time step of target sequence") 191 | 192 | example_args = parser.parse_args() 193 | 194 | ''' Load Segmentor ''' 195 | example_segmentor = yutils.load_segmentor(place="hpc") 196 | 197 | ''' Load model ''' 198 | with open(example_args.save + "/model.pt") as f: 199 | if example_args.cuda: 200 | example_model = torch.load(f) 201 | else: 202 | example_model = torch.load(f, map_location=lambda storage, loc: storage) 203 | example_model.cpu() 204 | example_model.eval() 205 | ''' Load word2idx only once ''' 206 | example_word2idx = yutils.pickle2dict(example_args.word2idx) 207 | 208 | example_main(example_args, example_model, example_word2idx) 209 | # example_single(example_args, example_model, example_word2idx) 210 | 211 | ''' TO numpy npz ''' 212 | # savez_model(example_model) 213 | 214 | # while True: 215 | # ''' 216 | # 1. segment sentences 217 | # 2. vectorize sentence, do padding and masks 218 | # 3. 
classify the pairs and get predictions 219 | # ''' 220 | # list_of_qa_pairs = [] 221 | # updated_qa_pairs = stance_classifier(example_segmentor, example_model, list_of_qa_pairs) 222 | # 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /main_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf8 3 | 4 | import argparse 5 | import os 6 | import time 7 | from progress.bar import Bar 8 | import yutils 9 | 10 | import numpy 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | 15 | from torch.autograd import Variable 16 | 17 | 18 | from nnet.blstm import BLSTM 19 | from nnet.lstm import LSTM 20 | from nnet.cnn import CNN  # note: nnet/cnn.py is not included in this repository; remove this import and the "CNN" model entry below if the file is missing 21 | 22 | torch.manual_seed(123456) 23 | 24 | 25 | def test_prf(pred, labels): 26 | """ 27 | log and return prf scores 28 | :return: 29 | """ 30 | total = len(labels) 31 | pred_right = [0, 0] 32 | pred_all = [0, 0] 33 | gold = [0, 0] 34 | for i in range(total): 35 | pred_all[pred[i]] += 1 36 | if pred[i] == labels[i]: 37 | pred_right[pred[i]] += 1 38 | gold[labels[i]] += 1 39 | 40 | print " Prediction:", pred_all, " Right:", pred_right, " Gold:", gold 41 | ''' -- for all labels -- ''' 42 | print " ****** Neg|Pos ******" 43 | accuracy = 1.0 * sum(pred_right) / total 44 | p, r, f1 = yutils.cal_prf(pred_all, pred_right, gold, formation=False) 45 | _, _, macro_f1 = yutils.cal_prf(pred_all, pred_right, gold, 46 | formation=False, 47 | metric_type="macro") 48 | print " Accuracy on test is %d/%d = %f" % (sum(pred_right), total, accuracy) 49 | print " Precision: %s\n Recall : %s\n F1 score : %s\n Macro F1 score on test (Neg|Pos) is %f" \ 50 | % (p, r, f1, macro_f1) 51 | 52 | return accuracy 53 | 54 | 55 | def test(model, dataset, args, data_part="test"): 56 | """ 57 | Evaluate the model on one part of the dataset and report accuracy 58 | :param model: 59 | :param args: 60 | :param dataset: 61 | :param data_part: 62 | :return: 63 | """ 64 | 65 | tvt_set = dataset[data_part] 66 | tvt_set = yutils.YDataset(tvt_set["xIndexes"], 67 | tvt_set["yLabels"], 68 | to_pad=True, max_len=args.sen_max_len) 69 | 70 | test_set = tvt_set 71 | sentences, sentences_seqlen, sentences_mask, labels = test_set.next_batch(len(test_set)) 72 | 73 | assert len(test_set) == len(sentences) == len(labels) 74 | 75 | tic = time.time() 76 | 77 | model.eval() 78 | ''' Prepare data and prediction''' 79 | batch_size = len(sentences) 80 | sentences_, sentences_seqlen_, sentences_mask_ = \ 81 | var_batch(args, batch_size, sentences, sentences_seqlen, sentences_mask) 82 | 83 | probs = model(sentences_, sentences_seqlen_, sentences_mask_) 84 | 85 | _, pred = torch.max(probs, dim=1) 86 | 87 | if args.cuda: 88 | pred = pred.view(-1).cpu().data.numpy() 89 | else: 90 | pred = pred.view(-1).data.numpy() 91 | 92 | tit = time.time() - tic 93 | print " Predicting {:d} examples using {:5.4f} seconds".format(len(test_set), tit) 94 | 95 | labels = numpy.asarray(labels) 96 | ''' log and return prf scores ''' 97 | accuracy = test_prf(pred, labels) 98 | 99 | return accuracy 100 | 101 | 102 | def var_batch(args, batch_size, sentences, sentences_seqlen, sentences_mask): 103 | """ 104 | Transform the input batch to PyTorch variables 105 | :return: 106 | """ 107 | # dtype = torch.from_numpy(sentences, dtype=torch.cuda.LongTensor) 108 | sentences_ = Variable(torch.LongTensor(sentences).view(batch_size, args.sen_max_len)) 109 | sentences_seqlen_ = Variable(torch.LongTensor(sentences_seqlen).view(batch_size, 1)) 110 |
sentences_mask_ = Variable(torch.LongTensor(sentences_mask).view(batch_size, args.sen_max_len)) 111 | 112 | if args.cuda: 113 | sentences_ = sentences_.cuda() 114 | sentences_seqlen_ = sentences_seqlen_.cuda() 115 | sentences_mask_ = sentences_mask_.cuda() 116 | 117 | return sentences_, sentences_seqlen_, sentences_mask_ 118 | 119 | 120 | def train(model, training_data, args, optimizer, criterion): 121 | model.train() 122 | 123 | batch_size = args.batch_size 124 | 125 | sentences, sentences_seqlen, sentences_mask, labels = training_data 126 | 127 | # print batch_size, len(sentences), len(labels) 128 | 129 | assert batch_size == len(sentences) == len(labels) 130 | 131 | ''' Prepare data and prediction''' 132 | sentences_, sentences_seqlen_, sentences_mask_ = \ 133 | var_batch(args, batch_size, sentences, sentences_seqlen, sentences_mask) 134 | labels_ = Variable(torch.LongTensor(labels)) 135 | if args.cuda: 136 | labels_ = labels_.cuda() 137 | 138 | assert len(sentences) == len(labels) 139 | 140 | model.zero_grad() 141 | probs = model(sentences_, sentences_seqlen_, sentences_mask_) 142 | loss = criterion(probs.view(len(labels_), -1), labels_) 143 | 144 | loss.backward() 145 | optimizer.step() 146 | 147 | 148 | def main(args): 149 | # define location to save the model 150 | if args.save == "__": 151 | # LSTM_100_40_8 152 | args.save = "saved_model/%s_%d_%d_%d" % \ 153 | (args.model, args.nhid, args.sen_max_len, args.batch_size) 154 | 155 | in_dir = "data/mr/" 156 | dataset = yutils.pickle2dict(in_dir + "features_glove.pkl") 157 | 158 | if args.is_test: 159 | with open(args.save + "/model.pt") as f: 160 | model = torch.load(f) 161 | test(model, dataset, args) 162 | 163 | else: 164 | ''' make sure the folder to save models exist ''' 165 | if not os.path.exists(args.save): 166 | os.mkdir(args.save) 167 | 168 | embeddings = yutils.pickle2dict(in_dir + "embeddings_glove.pkl") 169 | dataset["embeddings"] = embeddings 170 | emb_np = numpy.asarray(embeddings, dtype=numpy.float32) # from_numpy 171 | emb = torch.from_numpy(emb_np) 172 | 173 | models = {"LSTM": LSTM, "BLSTM": BLSTM, "CNN": CNN}  # "CNN" is usable only if nnet/cnn.py is provided 174 | model = models[args.model](embeddings=emb, 175 | input_dim=args.embsize, 176 | hidden_dim=args.nhid, 177 | num_layers=args.nlayers, 178 | output_dim=2, 179 | max_len=args.sen_max_len, 180 | dropout=args.dropout) 181 | 182 | if torch.cuda.is_available(): 183 | if not args.cuda: 184 | print "Warning: You have a CUDA device, so you should probably run with --cuda" 185 | else: 186 | torch.cuda.manual_seed(args.seed) 187 | model.cuda() 188 | 189 | optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=1e-5) 190 | criterion = nn.CrossEntropyLoss()  # note: this expects raw logits, while the models in nnet/ return softmax probabilities 191 | 192 | training_set = dataset["training"] 193 | training_set = yutils.YDataset(training_set["xIndexes"], 194 | training_set["yLabels"], 195 | to_pad=True, 196 | max_len=args.sen_max_len) 197 | 198 | best_acc_test, best_acc_valid = -numpy.inf, -numpy.inf 199 | batches_per_epoch = int(len(training_set)/args.batch_size) 200 | print "--------------\nEpoch 0 begins!"
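# note: the x10 factor below stretches training to ten times the nominal --epochs budget; the best model on validation is snapshotted after each epoch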
201 | max_train_steps = int(args.epochs * batches_per_epoch * 10) 202 | bar = Bar(" Processing", max=max_train_steps) 203 | tic = time.time() 204 | print "-----------------------------", max_train_steps, len(training_set), args.batch_size 205 | 206 | for step in xrange(max_train_steps): 207 | bar.next() 208 | training_batch = training_set.next_batch(args.batch_size) 209 | 210 | train(model, training_batch, args, optimizer, criterion) 211 | 212 | if (step+1) % batches_per_epoch == 0: 213 | print " using %.5f seconds" % (time.time() - tic) 214 | tic = time.time() 215 | ''' Test after each epoch ''' 216 | print "\n Begin to predict the results on Validation" 217 | acc_score = test(model, dataset, args, data_part="validation") 218 | 219 | print " ----Old best acc score on validation is %f" % best_acc_valid 220 | if acc_score > best_acc_valid: 221 | print " ----New acc score on validation is %f" % acc_score 222 | best_acc_valid = acc_score 223 | with open(args.save + "/model.pt", 'wb') as to_save: 224 | torch.save(model, to_save) 225 | 226 | acc_test = test(model, dataset, args) 227 | print " ----Old best acc score on test is %f" % best_acc_test 228 | if acc_test > best_acc_test: 229 | best_acc_test = acc_test 230 | print " ----New acc score on test is %f" % acc_test 231 | 232 | print "--------------\nEpoch %d begins!" % (training_set.epochs_completed + 1) 233 | 234 | # print the final result 235 | with open(args.save + "/model.pt") as f: 236 | model = torch.load(f) 237 | test(model, dataset, args) 238 | bar.finish() 239 | 240 | 241 | if __name__ == "__main__": 242 | parser = argparse.ArgumentParser(description="PyTorch BLSTM/LSTM for sentence classification") 243 | 244 | ''' load data and save model''' 245 | parser.add_argument("--save", type=str, default="__", 246 | help="path to save the model") 247 | 248 | ''' model parameters ''' 249 | parser.add_argument("--model", type=str, default="BLSTM", 250 | help="type of model to use (LSTM, BLSTM, or CNN)") 251 | parser.add_argument("--embsize", type=int, default=100, 252 | help="size of word embeddings") 253 | parser.add_argument("--emb", type=str, default="glove", 254 | help="type of word embeddings") 255 | parser.add_argument("--nhid", type=int, default=50, 256 | help="size of RNN hidden layer") 257 | parser.add_argument("--nlayers", type=int, default=1, 258 | help="number of layers of LSTM") 259 | parser.add_argument("--lr", type=float, default=0.01, 260 | help="learning rate") 261 | parser.add_argument("--epochs", type=int, default=100, 262 | help="number of training epochs") 263 | parser.add_argument("--batch_size", type=int, default=8, 264 | help="batch size") 265 | parser.add_argument("--dropout", type=float, default=0.1, 266 | help="dropout rate") 267 | parser.add_argument("--seed", type=int, default=123456, 268 | help="random seed for reproduction") 269 | parser.add_argument("--cuda", action="store_true", 270 | help="use CUDA") 271 | 272 | parser.add_argument("--sen_max_len", type=int, default=40, 273 | help="max time step of sentence sequence") 274 | ''' test purpose''' 275 | parser.add_argument("--is_test", action="store_true", 276 | help="flag to skip training and only run the test") 277 | 278 | my_args = parser.parse_args() 279 | 280 | torch.manual_seed(my_args.seed) 281 | 282 | main(my_args) 283 | -------------------------------------------------------------------------------- /nnet/.DS_Store: --------------------------------------------------------------------------------
/nnet/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/nnet/.DS_Store
--------------------------------------------------------------------------------
/nnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/albertwy/BiLSTM/78153783d8f4eae6c193607dca9f482b9a04672a/nnet/__init__.py
--------------------------------------------------------------------------------
/nnet/blstm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf8
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | 
9 | torch.manual_seed(123456)
10 | 
11 | 
12 | class BLSTM(nn.Module):
13 |     """
14 |     Implementation of BLSTM Concatenation for sentiment classification task
15 |     """
16 | 
17 |     def __init__(self, embeddings, input_dim, hidden_dim, num_layers, output_dim, max_len=40, dropout=0.5):
18 |         super(BLSTM, self).__init__()
19 | 
20 |         self.emb = nn.Embedding(num_embeddings=embeddings.size(0),
21 |                                 embedding_dim=embeddings.size(1),
22 |                                 padding_idx=0)
23 |         self.emb.weight = nn.Parameter(embeddings)
24 | 
25 |         self.input_dim = input_dim
26 |         self.hidden_dim = hidden_dim
27 |         self.output_dim = output_dim
28 | 
29 |         # sentence encoder
30 |         self.sen_len = max_len
31 |         self.sen_rnn = nn.LSTM(input_size=input_dim,
32 |                                hidden_size=hidden_dim,
33 |                                num_layers=num_layers,
34 |                                dropout=dropout,
35 |                                batch_first=True,
36 |                                bidirectional=True)
37 | 
38 |         self.output = nn.Linear(2 * self.hidden_dim, output_dim)
39 | 
40 |     def bi_fetch(self, rnn_outs, seq_lengths, batch_size, max_len):
41 |         rnn_outs = rnn_outs.view(batch_size, max_len, 2, -1)  # split forward/backward directions
42 | 
43 |         # (batch_size, max_len, 1, hid); the hard-coded .cuda() calls below assume a GPU run
44 |         fw_out = torch.index_select(rnn_outs, 2, Variable(torch.LongTensor([0])).cuda())
45 |         fw_out = fw_out.view(batch_size * max_len, -1)
46 |         bw_out = torch.index_select(rnn_outs, 2, Variable(torch.LongTensor([1])).cuda())
47 |         bw_out = bw_out.view(batch_size * max_len, -1)
48 | 
49 |         batch_range = Variable(torch.LongTensor(range(batch_size))).cuda() * max_len
50 |         batch_zeros = Variable(torch.zeros(batch_size).long()).cuda()
51 | 
52 |         fw_index = batch_range + seq_lengths.view(batch_size) - 1  # last valid step of each sequence
53 |         fw_out = torch.index_select(fw_out, 0, fw_index)  # (batch_size, hid)
54 | 
55 |         bw_index = batch_range + batch_zeros  # first step of each sequence
56 |         bw_out = torch.index_select(bw_out, 0, bw_index)
57 | 
58 |         outs = torch.cat([fw_out, bw_out], dim=1)
59 |         return outs
60 | 
61 |     def forward(self, sen_batch, sen_lengths, sen_mask_matrix):
62 |         """
63 | 
64 |         :param sen_batch: (batch, sen_length), tensor for sentence sequence
65 |         :param sen_lengths: (batch,), valid lengths before padding
66 |         :param sen_mask_matrix: (batch, sen_length), 1 for tokens, 0 for padding (unused here)
67 |         :return: (batch, output_dim), softmax class probabilities
68 |         """
69 | 
70 |         ''' Embedding Layer | Padding | Sequence_length 40 '''
71 |         sen_batch = self.emb(sen_batch)
72 | 
73 |         batch_size = len(sen_batch)
74 | 
75 |         ''' Bi-LSTM Computation '''
76 |         sen_outs, _ = self.sen_rnn(sen_batch.view(batch_size, -1, self.input_dim))
77 |         sen_rnn = sen_outs.contiguous().view(batch_size, -1, 2 * self.hidden_dim)  # (batch, sen_len, 2*hid)
78 | 
79 |         ''' Fetch the hidden state at the last valid time step of the forward pass
80 |         and the first-step state of the backward pass '''
81 |         sentence_batch = self.bi_fetch(sen_rnn, sen_lengths, batch_size, self.sen_len)  # (batch_size, 2*hid)
82 | 
83 |         representation = sentence_batch
84 |         out = self.output(representation)
85 |         out_prob = F.softmax(out.view(batch_size, -1))  # note: returns probabilities, not logits
86 | 
87 |         return out_prob
88 | 
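To make the index arithmetic in bi_fetch concrete, here is a small standalone sketch (hypothetical toy sizes, current PyTorch API, CPU only). After splitting the BLSTM output into its two directions and flattening to (batch * max_len, hid), the forward representation of sentence i lives at row i * max_len + length_i - 1, and the backward representation at row i * max_len:

import torch

batch_size, max_len, hid = 2, 4, 3
seq_lengths = torch.tensor([4, 2])                    # valid length of each sentence

rnn_outs = torch.randn(batch_size, max_len, 2 * hid)  # what the BLSTM returns
outs = rnn_outs.view(batch_size, max_len, 2, -1)      # split fw/bw directions

fw = outs[:, :, 0, :].contiguous().view(batch_size * max_len, -1)
bw = outs[:, :, 1, :].contiguous().view(batch_size * max_len, -1)

batch_range = torch.arange(batch_size) * max_len
fw_index = batch_range + seq_lengths - 1              # last valid step per sentence
bw_index = batch_range                                # step 0 per sentence

sent_repr = torch.cat([fw[fw_index], bw[bw_index]], dim=1)
print(sent_repr.shape)                                # torch.Size([2, 6]) == (batch, 2 * hid)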
--------------------------------------------------------------------------------
/nnet/lstm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf8
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | 
9 | torch.manual_seed(123456)
10 | 
11 | 
12 | class LSTM(nn.Module):
13 |     """
14 |     Implementation of a unidirectional LSTM for sentence classification
15 |     Final representation is the hidden state at the last valid time step
16 |     """
17 | 
18 |     def __init__(self, embeddings, input_dim, hidden_dim, num_layers, output_dim, max_len=40, dropout=0.5):
19 |         super(LSTM, self).__init__()
20 | 
21 |         self.emb = nn.Embedding(num_embeddings=embeddings.size(0),
22 |                                 embedding_dim=embeddings.size(1),
23 |                                 padding_idx=0)
24 |         self.emb.weight = nn.Parameter(embeddings)
25 | 
26 |         self.input_dim = input_dim
27 |         self.hidden_dim = hidden_dim
28 |         self.output_dim = output_dim
29 | 
30 |         # sentence encoder
31 |         self.sen_len = max_len
32 |         self.sen_rnn = nn.LSTM(input_size=input_dim,
33 |                                hidden_size=hidden_dim,
34 |                                num_layers=num_layers,
35 |                                dropout=dropout,
36 |                                batch_first=True,
37 |                                bidirectional=False)
38 | 
39 |         self.output = nn.Linear(self.hidden_dim, output_dim)
40 | 
41 |     def _fetch(self, rnn_outs, seq_lengths, batch_size, max_len):
42 |         rnn_outs = rnn_outs.view(batch_size, max_len, 1, -1)
43 | 
44 |         # (batch_size, max_len, 1, hid); the hard-coded .cuda() calls below assume a GPU run
45 |         fw_out = torch.index_select(rnn_outs, 2, Variable(torch.LongTensor([0])).cuda())
46 |         fw_out = fw_out.view(batch_size * max_len, -1)
47 | 
48 |         batch_range = Variable(torch.LongTensor(range(batch_size))).cuda() * max_len
49 | 
50 |         fw_index = batch_range + seq_lengths.view(batch_size) - 1  # last valid step per sequence
51 |         fw_out = torch.index_select(fw_out, 0, fw_index)  # (batch_size, hid)
52 | 
53 |         return fw_out
54 | 
55 |     def forward(self, sen_batch, sen_lengths, sen_mask_matrix):
56 |         """
57 | 
58 |         :param sen_batch: (batch, sen_length), tensor for sentence sequence
59 |         :param sen_lengths: (batch,), valid lengths before padding
60 |         :param sen_mask_matrix: (batch, sen_length), 1 for tokens, 0 for padding (unused here)
61 |         :return: (batch, output_dim), softmax class probabilities
62 |         """
63 | 
64 |         ''' Embedding Layer | Padding | Sequence_length 40 '''
65 |         sen_batch = self.emb(sen_batch)
66 | 
67 |         batch_size = len(sen_batch)
68 | 
69 |         ''' LSTM Computation '''
70 |         sen_outs, _ = self.sen_rnn(sen_batch.view(batch_size, -1, self.input_dim))
71 | 
72 |         # batch_first only changes the view; the output may not be contiguous
73 |         sen_rnn = sen_outs.contiguous().view(batch_size, -1, self.hidden_dim)  # (batch, sen_len, hid)
74 | 
75 |         ''' Fetch the hidden state at the last valid time step
76 |         '''
77 |         sentence_batch = self._fetch(sen_rnn, sen_lengths, batch_size, self.sen_len)  # (batch_size, hid)
78 | 
79 |         representation = sentence_batch
80 |         out = self.output(representation)
81 |         out_prob = F.softmax(out.view(batch_size, -1))  # note: returns probabilities, not logits
82 | 
83 |         return out_prob
84 | 
--------------------------------------------------------------------------------
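Both _fetch above and bi_fetch in blstm.py recover the last valid hidden state by hand. For comparison, newer PyTorch can achieve the same effect with pack_padded_sequence, which makes the LSTM skip padding and return the last valid state per direction in h_n directly. A minimal sketch (hypothetical sizes; not part of this repository):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

batch, max_len, emb_dim, hid = 2, 5, 10, 7
rnn = nn.LSTM(emb_dim, hid, batch_first=True, bidirectional=True)

x = torch.randn(batch, max_len, emb_dim)   # padded inputs
lengths = torch.tensor([5, 3])             # valid lengths, sorted descending

packed = pack_padded_sequence(x, lengths, batch_first=True)
_, (h_n, _) = rnn(packed)                  # h_n: (num_directions, batch, hid)

# last valid forward state and the step-0 backward state, concatenated:
sent_repr = torch.cat([h_n[0], h_n[1]], dim=1)   # (batch, 2 * hid)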
/preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf-8
3 | """
4 | This is the older vectorization pipeline, intended for research experiments rather than production use.
5 | Tips:
6 |     - Embeddings are extracted to a numpy matrix
7 |     - Use pickle instead of JSON to avoid string-encoding inconsistencies
8 |     - Vectorization and padding can be done together
9 | """
10 | import sys
11 | import numpy as np
12 | 
13 | import yutils
14 | 
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 | np.random.seed(1234567)
18 | 
19 | #################
20 | # read text files
21 | #################
22 | 
23 | 
24 | def read_mr_txt(filename="data/mr/"):
25 |     """
26 |     Read labeled data: one "label \t sentence" pair per line
27 | 
28 |     :param filename: path to the labeled data file
29 |     :return: tokenized sentences and their labels
30 |     """
31 |     raw_data = yutils.read_file2list(filename)
32 | 
33 |     sentences = []
34 |     labels = []  # 0 / 1
35 | 
36 |     for line in raw_data:
37 |         label, sentence = line.split("\t")
38 | 
39 |         sentences.append(sentence)
40 |         labels.append(label)
41 | 
42 |     assert len(sentences) == len(labels)
43 |     sentences = yutils.tokenize_sentence(sentences, choice="list")
44 |     sentences = [yutils.list2string(sentence) for sentence in sentences]
45 |     return sentences, labels
46 | 
47 | #################
48 | # read embeddings
49 | #################
50 | 
51 | 
52 | def read_emb_idx(filename):
53 |     """
54 |     1. Read the embeddings file into:
55 |        "embeddings": numpy matrix, each row a vector at its corresponding index
56 |        "word2idx": word2idx[word] = idx, the row index in the "embeddings" matrix
57 |        "idx2word": the reverse dict of "word2idx"
58 |     2. Add padding and unk entries to all three structures
59 |     :param filename:
60 |         file format per line: a word followed by its embedding values, separated by blanks
61 |     :return:
62 |         vocab = {"embeddings": embeddings, "word2idx": word2idx, "idx2word": idx2word}
63 |     """
64 |     with open(filename, 'r') as f:
65 |         embeddings = []
66 |         word2idx = dict()
67 | 
68 |         word2idx["_padding"] = 0  # PyTorch Embedding lookup needs the padding index to be zero
69 |         word2idx["_unk"] = 1
70 | 
71 |         for line in f:
72 |             line = line.strip()
73 |             one = line.split(' ')
74 |             word = one[0]
75 |             emb = [float(i) for i in one[1:]]
76 |             embeddings.append(emb)
77 |             word2idx[word] = len(word2idx)
78 | 
79 |         ''' Add padding and unknown word to embeddings and word2idx '''
80 |         emb_dim = len(embeddings[0])
81 |         embeddings.insert(0, np.zeros(emb_dim))  # _padding
82 |         embeddings.insert(1, np.random.random(emb_dim))  # _unk
83 | 
84 |         embeddings = np.asarray(embeddings, dtype=np.float32)
85 |         embeddings = embeddings.reshape(len(embeddings), emb_dim)
86 | 
87 |         idx2word = dict((word2idx[word], word) for word in word2idx)
88 |         vocab = {"embeddings": embeddings, "word2idx": word2idx, "idx2word": idx2word}
89 | 
90 |         print "Finished loading embeddings from %s * * * * * * * * * * * *" % filename
91 |         return vocab
92 | 
93 | 
94 | #############################################################
95 | """ Raw data --> pickle
96 | output file structure looks like this:
97 | {"training": {
98 |         "xIndexes": []
99 |         "yLabels": []
100 |         }
101 |  "validation": ...
102 |  "test": ...
103 |  "word2idx": {"_padding": 0, "_unk": 1, "1st": 2, "hello": 3, ...}
104 |  "embedding": [ [word0], [word1], [word2], ...]
105 | }
106 | """
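# A hypothetical sanity check (illustration only) of the structure documented
# above, once processing() below has produced the pickles; note that the
# features pickle holds only the three splits, while word2idx and embeddings
# are written to separate pickles:
#
#   data = yutils.pickle2dict("data/mr/features_glove.pkl")
#   print data.keys()                       # the three splits: training / validation / test
#   print data["training"]["xIndexes"][0]   # e.g. [2, 57, 3, ...] (word indexes)
#   print data["training"]["yLabels"][0]    # 0 or 1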
107 | #################
108 | # evaluation
109 | #################
110 | 
111 | 
112 | def sentence_to_index(word2idx, sentences):
113 |     """
114 |     Transform sentences into lists of word indexes
115 |     :param word2idx:
116 |         word2idx = {word: idx, ...}
117 |     :param sentences:
118 |         list of sentences, each a list of words
119 |     :return:
120 |     """
121 |     print "-------------begin making sentence xIndexes-------------"
122 |     sentences_indexes = []
123 |     for sentence in sentences:
124 |         s_index = []
125 |         for word in sentence:
126 | 
127 |             if word == "\n":
128 |                 continue
129 |             if word in word2idx:
130 |                 s_index.append(word2idx[word])
131 |             else:
132 |                 s_index.append(word2idx["_unk"])
133 |                 print " --", word, "-- "  # log out-of-vocabulary words
134 | 
135 |         if len(s_index) == 0:
136 |             print len(sentence), "+++++++++++++++++++++++++++++++++ empty sentence"
137 |             s_index.append(word2idx["_unk"])
138 |         sentences_indexes.append(s_index)
139 |     assert len(sentences_indexes) == len(sentences)
140 |     print "-------------finish making sentence xIndexes-------------"
141 |     return sentences_indexes
142 | 
143 | 
144 | def make_datasets(word2idx, raw_data):
145 |     """
146 |     :param word2idx:
147 |         word2idx = {word: idx, ...}
148 |     :param raw_data:
149 |         raw_data = {"training": (inputs, labels),
150 |                     "validation": ...,
151 |                     "test": ...}
152 |     :return:
153 |     """
154 |     datasets = dict()
155 | 
156 |     for i in ["training", "validation", "test"]:
157 |         sentences, labels = raw_data[i]
158 |         xIndexes = sentence_to_index(word2idx, sentences)
159 |         yLabels = [int(label) for label in labels]
160 |         yLabels = np.asarray(yLabels, dtype=np.int64).reshape(len(labels))
161 |         datasets[i] = {"xIndexes": xIndexes,
162 |                        "yLabels": yLabels}
163 | 
164 |     return datasets
165 | 
166 | #############################################################
167 | 
168 | 
169 | def processing(args):
170 |     input_dir = "data/mr/"
171 |     output_dir = input_dir
172 |     # read raw text
173 |     data = []  # [sentences, labels] per split
174 |     fns = ["data/mr/MR.task.train",
175 |            "data/mr/MR.task.test"]
176 |     for fn in fns:
177 |         # sentences, labels
178 |         sentences = yutils.read_file2lol(fn + ".sentences")
179 |         labels = yutils.read_file2list(fn + ".labels")
180 |         data.append([sentences, labels])
181 | 
182 |     assert len(data[0][0]) == len(data[0][1])
183 |     assert len(data[1][0]) == len(data[1][1])
184 | 
185 |     # split the dataset: train, test
186 |     yutils.shuffle(data[0], seed=123456)
187 |     test = data[1]
188 |     if args.has_valid:
189 |         train_num = int(len(data[0][0]) * 0.8)
190 |         train = [d[:train_num] for d in data[0]]
191 |         valid = [d[train_num:] for d in data[0]]
192 |     else:
193 |         train = data[0]
194 |         valid = test
195 | 
196 |     assert len(train[0]) == len(train[1])
197 |     assert len(valid[0]) == len(valid[1])
198 |     assert len(test[0]) == len(test[1])
199 | 
200 |     raw_data = {"training": train,
201 |                 "validation": valid,
202 |                 "test": test}
203 | 
204 |     # read the embedding files
205 |     run_place = {"hpc": "/users2/jhyuan/", "local": "/Users/Isaac/athand/Code/"}
206 |     emb_file = run_place[args.place] + "nlp_res/embeddings/glove/glove.6B.100d.txt"
207 |     vocab = read_emb_idx(emb_file)
208 |     word2idx, embeddings = vocab["word2idx"], vocab["embeddings"]
209 | 
210 |     # transform sentences to word indexes
211 |     datasets = make_datasets(word2idx, raw_data)
212 | 
213 |     # output the transformed files
214 |     yutils.dict2pickle(datasets, output_dir + "/features_glove.pkl")
215 |     yutils.dict2pickle(word2idx, output_dir + "/word2idx_glove.pkl")
216 |     yutils.dict2pickle(embeddings, output_dir + "/embeddings_glove.pkl")
217 | 
218 |     # test correctness
219 |     word2idx = yutils.pickle2dict(output_dir + "/word2idx_glove.pkl")
220 |     print word2idx["_padding"], word2idx["_unk"]
221 | 
222 | 
223 | if __name__ == "__main__":
224 |     import argparse
225 | 
226 |     parser = argparse.ArgumentParser(description="Pre-processing Movie Review Dataset")
227 | 
228 |     parser.add_argument("--place", type=str, default="local",
229 |                         help="decide the location of LTP and data")
230 | 
231 |     parser.add_argument("--has_valid", action="store_true",
232 |                         help="whether to hold out 'real' validation data for tuning the model")
233 | 
234 |     my_args = parser.parse_args()
235 | 
236 |     # for fn in ["data/mr/MR.task.train", "data/mr/MR.task.test"]:
237 |     #     sentences, labels = read_mr_txt(fn)
238 |     #     yutils.write_list2file(sentences, fn + ".sentences")
239 |     #     yutils.write_list2file(labels, fn + ".labels")
240 |     processing(my_args)
241 | 
--------------------------------------------------------------------------------
/yutils.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | Utility functions used by the other modules:
4 | tokenization, file I/O, pickling, PRF metrics, padding and batching.
5 | 
6 | 
7 | """
8 | import sys
9 | import random
10 | 
11 | import numpy as np
12 | 
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 | random.seed(1)
16 | 
17 | 
18 | ###############
19 | # English pre-processing
20 | ###############
21 | def tokenize_sentence(senlist, choice="string"):
22 |     import nltk
23 |     tokenized_sen = []
24 |     if choice == "string":
25 |         for s in senlist:
26 |             s = s.replace(" #SemST", "")  # remove those irrelevant tags
27 |             s = s.lower()
28 |             tokens = nltk.word_tokenize(s)
29 |             # print type(tokens)
30 |             tokens = list2string(tokens)
31 |             tokenized_sen.append(tokens)
32 |     else:
33 |         for s in senlist:
34 |             tokens = nltk.word_tokenize(s)
35 |             tokens = list(tokens)
36 |             tokenized_sen.append(tokens)
37 | 
38 |     return tokenized_sen
39 | 
40 | 
41 | ###############
42 | # String Utilities
43 | ###############
44 | def list2string(list_of_words, has_blank=True):
45 |     """convert a list of segmented words into a single string"""
46 |     l = list_of_words
47 |     s = ""
48 |     if has_blank:
49 |         for i in l:
50 |             if i not in set(["\n", " ", "\n\n"]):
51 |                 s += i + " "
52 |     else:
53 |         for i in l:
54 |             if i != "\n" and i != " " and i != "\n\n":
55 |                 s += i
56 |     return s
57 | 
58 | 
59 | def string2list(sentence_in_string):
60 |     """convert a string with '\n' to a list of words without '\n'"""
61 |     return sentence_in_string.strip().split()  # strip removes the trailing \n
62 | 
63 | 
64 | # contents is a list of strings
65 | def write_list2file(contents, filename):
66 |     s = ''
67 |     for i in contents:
68 |         s += (str(i) + "\n")
69 |     with open(filename, 'w') as f:
70 |         f.write(s)
71 |     print "********** Wrote to file successfully"
72 | 
73 | 
74 | # read raw text into a list (sentences as strings)
75 | def read_file2list(filename):
76 |     contents = []
77 |     with open(filename, 'r') as f:
78 |         contents = [line.split("\n")[0] for line in f]
79 |     print "The file has lines: ", len(contents)
80 |     return contents
81 | 
82 | 
83 | # read a segmented corpus into a list (sentences as lists of words)
84 | def read_file2lol(filename):
85 |     with open(filename, 'r') as f:
86 |         contents = [string2list(line) for line in f]
87 |     print "The file has lines: ", len(contents)
88 |     return contents
89 | 
90 | 
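# Illustration (hypothetical values) of the helpers above: with choice="string",
# tokenize_sentence lowercases, strips the " #SemST" tag, tokenizes with nltk,
# and re-joins the tokens with blanks; string2list simply splits again:
#
#   tokenize_sentence(["Great movie! #SemST"], choice="string")
#   # -> ["great movie ! "]
#   string2list("great movie ! ")
#   # -> ["great", "movie", "!"]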
91 | # read raw text (segmented or tokenized) and report average string length
92 | def avg_str_len(filename):
93 |     contents = read_file2lol(filename)
94 |     num_sentences = len(contents)
95 |     len_list = [len(sen) for sen in contents]
96 |     num_words = sum(len_list)
97 |     words_per_sen = 1.0 * num_words / num_sentences
98 |     print "%d sentences have %d words, avg=%f" % (num_sentences, num_words, words_per_sen)
99 |     print "max length = %d min length = %d" % (max(len_list), min(len_list))
100 |     return words_per_sen
101 | 
102 | 
103 | ###################
104 | # Serialization to pickle
105 | ###################
106 | def dict2pickle(your_dict, out_file):
107 |     try:
108 |         import cPickle as pickle
109 |     except ImportError:
110 |         import pickle
111 |     with open(out_file, 'wb') as f:
112 |         pickle.dump(your_dict, f)
113 | 
114 | 
115 | def pickle2dict(in_file):
116 |     try:
117 |         import cPickle as pickle
118 |     except ImportError:
119 |         import pickle
120 |     with open(in_file, 'rb') as f:  # binary mode to match pickle.dump
121 |         your_dict = pickle.load(f)
122 |     return your_dict
123 | 
124 | 
125 | def cal_word_freq(corpus, input_format="listoflist"):
126 |     """
127 |     :param corpus: list of sentences (each a list of segmented words)
128 | 
129 |     :return: word frequencies of the given corpus, most frequent first
130 |     """
131 |     if input_format != "listoflist":
132 |         corpus = [string2list(i) for i in corpus]
133 |     freq = dict()
134 |     for sentence in corpus:
135 |         for word in sentence:
136 |             if word not in freq:
137 |                 freq[word] = 0  # fixed: starting at 1 would double-count the first occurrence
138 |             freq[word] += 1
139 |     result = [[freq[word], word] for word in freq]
140 |     revert_result = sorted(result, key=lambda d: d[0], reverse=True)
141 |     print "The word freq of given corpus"
142 |     for i in revert_result:
143 |         print i[0], i[1]
144 |     return [str(i[0]) + " " + str(i[1]) + "\n" for i in revert_result]
145 | 
146 | 
147 | def shuffle(lol, seed=1234567890):
148 |     """
149 |     lol :: list of lists as input
150 |     seed :: seed for the shuffling
151 | 
152 |     shuffle each list in place, all in the same order
153 |     """
154 |     for l in lol:
155 |         random.seed(seed)  # re-seed before each list so every list gets the same permutation
156 |         random.shuffle(l)
157 | 
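# Illustration (hypothetical values): because shuffle() re-seeds before
# shuffling each list, parallel lists stay aligned after shuffling:
#
#   sents, labels = ["s0", "s1", "s2"], [0, 1, 2]
#   shuffle([sents, labels], seed=42)
#   # both lists receive the same permutation, e.g.
#   #   sents  -> ["s1", "s2", "s0"]
#   #   labels -> [1, 2, 0]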
158 | 
159 | def cal_prf(pred, right, gold, formation=True, metric_type=""):
160 |     """
161 |     :param pred: number of predictions per class
162 |     :param right: number of correct predictions per class
163 |     :param gold: number of gold instances per class
164 |     :param formation: whether to format the floats to 6 digits (as strings)
165 |     :param metric_type: "", "macro" or "micro"
166 |     :return: precision, recall and F1, per class or aggregated
167 |     """
168 |     ''' e.g. Pred: [0, 2905, 0]  Right: [0, 2083, 0]  Gold: [370, 2083, 452] '''
169 |     num_class = len(pred)
170 |     precision = [0.0] * num_class
171 |     recall = [0.0] * num_class
172 |     f1_score = [0.0] * num_class
173 | 
174 |     for i in xrange(num_class):
175 |         ''' cal precision for each class: right / predicted '''
176 |         precision[i] = 0 if pred[i] == 0 else 1.0 * right[i] / pred[i]
177 | 
178 |         ''' cal recall for each class: right / gold '''
179 |         recall[i] = 0 if gold[i] == 0 else 1.0 * right[i] / gold[i]
180 | 
181 |         ''' cal f1 for each class: 2pr / (p + r) '''
182 |         f1_score[i] = 0 if precision[i] == 0 or recall[i] == 0 \
183 |             else 2.0 * (precision[i] * recall[i]) / (precision[i] + recall[i])
184 | 
185 |         if formation:
186 |             precision[i] = precision[i].__format__(".6f")  # note: converts the values to strings
187 |             recall[i] = recall[i].__format__(".6f")
188 |             f1_score[i] = f1_score[i].__format__(".6f")
189 | 
190 |     ''' PRF for each label or PRF for all labels '''
191 |     if metric_type == "macro":  # note: the aggregated branches require formation=False
192 |         precision = sum(precision) / len(precision)
193 |         recall = sum(recall) / len(recall)
194 |         f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
195 |     elif metric_type == "micro":
196 |         precision = 1.0 * sum(right) / sum(pred) if sum(pred) > 0 else 0
197 |         recall = 1.0 * sum(right) / sum(gold) if sum(gold) > 0 else 0  # fixed: the condition was sum(recall) > 0
198 |         f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
199 | 
200 |     return precision, recall, f1_score
201 | 
202 | 
203 | #################
204 | # Padding, Mask Matrix and NextBatch training
205 | #################
206 | 
207 | 
208 | def get_padding(sentences, max_len):
209 |     """
210 |     :param sentences: raw sentence --> index-padded sentence
211 |                       [2, 3, 4], 5 --> [2, 3, 4, 0, 0]
212 |     :param max_len: number of steps to unroll for an LSTM
213 |     :return: sentences of max_len size with zero paddings, plus the valid lengths
214 |     """
215 |     seq_len = np.zeros((0,))
216 |     padded = np.zeros((0, max_len))
217 |     for sentence in sentences:
218 |         num_words = len(sentence)
219 |         num_pad = max_len - num_words
220 |         ''' special case for answers: max_len 60 = first 45 + last 15 tokens '''
221 |         if max_len == 60 and num_words > 60:
222 |             sentence = sentence[:45] + sentence[num_words-15:]
223 |             sentence = np.asarray(sentence, dtype=np.int64).reshape(1, -1)
224 |         else:
225 |             sentence = np.asarray(sentence[:max_len], dtype=np.int64).reshape(1, -1)
226 |         if num_pad > 0:
227 |             zero_paddings = np.zeros((1, num_pad), dtype=np.int64)
228 |             sentence = np.concatenate((sentence, zero_paddings), axis=1)
229 |         else:
230 |             num_words = max_len  # truncated sentences count as full length
231 | 
232 |         padded = np.concatenate((padded, sentence), axis=0)
233 |         seq_len = np.concatenate((seq_len, [num_words]))
234 |     return padded.astype(np.int64), seq_len.astype(np.int64)
235 | 
236 | 
237 | def get_mask_matrix(seq_lengths, max_len):
238 |     """
239 |     [5, 2, 4, ..., 7], 10 -->
240 |     [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
241 |      ...,
242 |      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
243 |     ]
244 |     :param seq_lengths: valid length of each sequence
245 |     :param max_len: padded length
246 |     :return: 0/1 mask matrix of shape (len(seq_lengths), max_len)
247 |     """
248 |     mask_matrix = np.ones((0, max_len))
249 |     for seq_len in seq_lengths:
250 |         num_mask = max_len - seq_len
251 |         mask = np.ones((1, seq_len), dtype=np.int64)
252 |         if num_mask > 0:
253 |             zero_paddings = np.zeros((1, num_mask), dtype=np.int64)
254 |             mask = np.concatenate((mask, zero_paddings), axis=1)
255 |         mask_matrix = np.concatenate((mask_matrix, mask), axis=0)
256 | 
257 |     return mask_matrix.astype(np.int64)
258 | 
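# Illustration (hypothetical values) tying the two helpers above together:
#
#   padded, seq_len = get_padding([[2, 3, 4], [5, 6]], max_len=5)
#   # padded  -> [[2, 3, 4, 0, 0],
#   #             [5, 6, 0, 0, 0]]
#   # seq_len -> [3, 2]
#   mask = get_mask_matrix(seq_len, max_len=5)
#   # mask    -> [[1, 1, 1, 0, 0],
#   #             [1, 1, 0, 0, 0]]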
259 | 
260 | class YDataset(object):
261 |     def __init__(self, features, labels, to_pad=True, max_len=40):
262 |         """
263 |         All sentences are lists of word indexes!
264 |         :param features: list containing the sequences to be padded and batched
265 |         :param labels: labels aligned with the sequences
266 |         """
267 |         self.features = features
268 |         self.labels = labels
269 |         self.pad_max_len = max_len
270 |         self.seq_lens = None
271 |         self.mask_matrix = None
272 | 
273 |         assert len(features) == len(self.labels)
274 | 
275 |         self._num_examples = len(self.labels)
276 |         self._epochs_completed = 0
277 |         self._index_in_epoch = 0
278 | 
279 |         if to_pad:
280 |             if max_len:
281 |                 self._padding()
282 |                 self._mask()
283 |             else:
284 |                 print "Need more information about padding max_length"
285 | 
286 |     def __len__(self):
287 |         return self._num_examples
288 | 
289 |     @property
290 |     def epochs_completed(self):
291 |         return self._epochs_completed
292 | 
293 |     def _padding(self):
294 |         self.features, self.seq_lens = get_padding(self.features, max_len=self.pad_max_len)
295 | 
296 |     def _mask(self):
297 |         self.mask_matrix = get_mask_matrix(self.seq_lens, max_len=self.pad_max_len)
298 | 
299 |     def _shuffle(self, seed):
300 |         """
301 |         After each epoch, the data need to be shuffled
302 |         (note: the seed argument is currently unused; np.random.shuffle uses global state)
303 |         """
304 |         perm = np.arange(self._num_examples)
305 |         np.random.shuffle(perm)
306 | 
307 |         self.features = self.features[perm]
308 |         self.seq_lens = self.seq_lens[perm]
309 |         self.mask_matrix = self.mask_matrix[perm]
310 |         self.labels = self.labels[perm]
311 | 
312 |     def next_batch(self, batch_size, seed=123456):
313 |         """Return the next `batch_size` examples from this data set."""
314 |         start = self._index_in_epoch
315 |         self._index_in_epoch += batch_size
316 |         if self._index_in_epoch > self._num_examples:
317 |             # finished an epoch
318 |             self._epochs_completed += 1
319 |             ''' shuffle features and labels '''
320 |             self._shuffle(seed=seed)
321 | 
322 |             start = 0
323 |             self._index_in_epoch = batch_size
324 |             assert batch_size <= self._num_examples
325 |         end = self._index_in_epoch
326 | 
327 |         features = self.features[start:end]
328 |         seq_lens = self.seq_lens[start:end]
329 |         mask_matrix = self.mask_matrix[start:end]
330 |         labels = self.labels[start:end]
331 | 
332 |         return features, seq_lens, mask_matrix, labels
333 | 
334 | 
335 | if __name__ == "__main__":
336 |     print "------------This is for utility test--------------"
337 | 
338 |     avg_str_len("data/mr/MR.task.test")
339 | 
--------------------------------------------------------------------------------
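Taken together, get_padding, get_mask_matrix and YDataset implement the input pipeline that main_batch.py consumes through next_batch. A minimal usage sketch (hypothetical index data; assumes the definitions in yutils.py are in scope):

import numpy as np

# hypothetical pre-indexed sentences and binary labels
xIndexes = [[2, 5, 7], [3, 4], [6, 2, 9, 8]]
yLabels = np.asarray([1, 0, 1], dtype=np.int64)

dataset = YDataset(xIndexes, yLabels, to_pad=True, max_len=5)

batch_size = 2
for _ in range(len(dataset) // batch_size):  # roughly one epoch
    feats, seq_lens, mask, labels = dataset.next_batch(batch_size)
    # feats: (2, 5) padded index matrix; seq_lens: (2,); mask: (2, 5); labels: (2,)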