├── .gitignore ├── CC-BY-NC ├── Case Studies └── README.md ├── Data ├── DataScraping │ ├── Getting_links.py │ ├── README.md │ └── script_for_download_data_of_SCI.ipynb ├── ILDC_expert │ ├── ILDC_EXPERT_Convert_To_CSV.ipynb │ └── README.md ├── ILDC_multi │ └── README.md ├── ILDC_single │ └── README.md ├── Preprocessing │ ├── label_generation_multi.py │ ├── label_generation_single.py │ └── preprocess.py ├── README.md └── images │ └── stats.png ├── LICENSE ├── Models ├── CNN │ ├── CNN__final.ipynb │ └── README.md ├── Classical Models │ ├── README.md │ ├── classical_models_doc2vec.py │ └── classical_models_sent2vec_avgd.py ├── Explanations │ ├── README.md │ ├── XLNet_noatt_2layer_occlusion.ipynb │ ├── metrics and results │ │ ├── README.md │ │ ├── among users scores │ │ │ ├── README.md │ │ │ ├── resultsamongusers.xlsx │ │ │ └── uservsusermetric.ipynb │ │ ├── anno_explanations_scores.xlsx │ │ ├── gold_explanations.json │ │ ├── gold_explanations_ranked.json │ │ ├── metricmaker.py │ │ ├── occ_explanations.json │ │ ├── result_files │ │ │ ├── Rank_10_to_10.txt │ │ │ ├── Rank_1_to_1.txt │ │ │ ├── Rank_1_to_10.txt │ │ │ ├── Rank_1_to_5.txt │ │ │ ├── Rank_2_to_2.txt │ │ │ ├── Rank_3_to_3.txt │ │ │ ├── Rank_4_to_4.txt │ │ │ ├── Rank_5_to_10.txt │ │ │ ├── Rank_5_to_5.txt │ │ │ ├── Rank_6_to_6.txt │ │ │ ├── Rank_7_to_7.txt │ │ │ ├── Rank_8_to_8.txt │ │ │ └── Rank_9_to_9.txt │ │ └── xl_anno_make.py │ └── occ_explanations_hierarchical.ipynb ├── README.md ├── Sequential_Models │ ├── BIGRU_Attention_final.ipynb │ ├── BIGRU_final.ipynb │ ├── CatchPhrase │ │ └── README.md │ ├── HAN_final.ipynb │ └── README.md └── transformers │ ├── concatenated │ ├── README.md │ ├── XLNet_full_concat_results.ipynb │ └── concat_XLNet_embeddings_maker.ipynb │ ├── trained_on_multi │ ├── BERT_training_notebook.ipynb │ ├── DistilBERT_training_notebook.ipynb │ ├── README.md │ ├── RoBERTa_training_notebook.ipynb │ └── XLNet_training_notebook.ipynb │ ├── trained_on_single │ ├── BERT_on_single.ipynb │ ├── README.md │ ├── RoBERTa_on_single.ipynb │ └── XLNet_on_single.ipynb │ └── voting ensemble │ ├── README.md │ ├── VE_RoBERTa.ipynb │ └── VE_XLNet.ipynb ├── README.md └── Results ├── Attention Score vs Averaged Chunk Size all 25.jpg ├── BERT visualization.jpg ├── BERT_tsne_test.png ├── BLEU.png ├── Doc2Vec_tsne_test.png ├── JACCARD SIMILARITY.png ├── METEOR.png ├── OVERLAP-MAX.png ├── OVERLAP-MIN.png ├── Occlusion Score vs Averaged Chunk Size all 25.jpg ├── README.md ├── ROUGE-1.png ├── ROUGE-2.png └── stats.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | .DS_Store 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /CC-BY-NC: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial 4.0 International 2 | 3 | Creative Commons Corporation ("Creative Commons") is not a law firm and 4 | does not provide legal services or legal advice. Distribution of 5 | Creative Commons public licenses does not create a lawyer-client or 6 | other relationship. Creative Commons makes its licenses and related 7 | information available on an "as-is" basis. Creative Commons gives no 8 | warranties regarding its licenses, any material licensed under their 9 | terms and conditions, or any related information. Creative Commons 10 | disclaims all liability for damages resulting from their use to the 11 | fullest extent possible. 12 | 13 | Using Creative Commons Public Licenses 14 | 15 | Creative Commons public licenses provide a standard set of terms and 16 | conditions that creators and other rights holders may use to share 17 | original works of authorship and other material subject to copyright and 18 | certain other rights specified in the public license below. The 19 | following considerations are for informational purposes only, are not 20 | exhaustive, and do not form part of our licenses. 21 | 22 | - Considerations for licensors: Our public licenses are intended for 23 | use by those authorized to give the public permission to use 24 | material in ways otherwise restricted by copyright and certain other 25 | rights. Our licenses are irrevocable. Licensors should read and 26 | understand the terms and conditions of the license they choose 27 | before applying it. 
Licensors should also secure all rights 28 | necessary before applying our licenses so that the public can reuse 29 | the material as expected. Licensors should clearly mark any material 30 | not subject to the license. This includes other CC-licensed 31 | material, or material used under an exception or limitation to 32 | copyright. More considerations for licensors : 33 | wiki.creativecommons.org/Considerations\_for\_licensors 34 | 35 | - Considerations for the public: By using one of our public licenses, 36 | a licensor grants the public permission to use the licensed material 37 | under specified terms and conditions. If the licensor's permission 38 | is not necessary for any reason–for example, because of any 39 | applicable exception or limitation to copyright–then that use is not 40 | regulated by the license. Our licenses grant only permissions under 41 | copyright and certain other rights that a licensor has authority to 42 | grant. Use of the licensed material may still be restricted for 43 | other reasons, including because others have copyright or other 44 | rights in the material. A licensor may make special requests, such 45 | as asking that all changes be marked or described. Although not 46 | required by our licenses, you are encouraged to respect those 47 | requests where reasonable. More considerations for the public : 48 | wiki.creativecommons.org/Considerations\_for\_licensees 49 | 50 | Creative Commons Attribution-NonCommercial 4.0 International Public 51 | License 52 | 53 | By exercising the Licensed Rights (defined below), You accept and agree 54 | to be bound by the terms and conditions of this Creative Commons 55 | Attribution-NonCommercial 4.0 International Public License ("Public 56 | License"). To the extent this Public License may be interpreted as a 57 | contract, You are granted the Licensed Rights in consideration of Your 58 | acceptance of these terms and conditions, and the Licensor grants You 59 | such rights in consideration of benefits the Licensor receives from 60 | making the Licensed Material available under these terms and conditions. 61 | 62 | - Section 1 – Definitions. 63 | 64 | - a. Adapted Material means material subject to Copyright and 65 | Similar Rights that is derived from or based upon the Licensed 66 | Material and in which the Licensed Material is translated, 67 | altered, arranged, transformed, or otherwise modified in a 68 | manner requiring permission under the Copyright and Similar 69 | Rights held by the Licensor. For purposes of this Public 70 | License, where the Licensed Material is a musical work, 71 | performance, or sound recording, Adapted Material is always 72 | produced where the Licensed Material is synched in timed 73 | relation with a moving image. 74 | - b. Adapter's License means the license You apply to Your 75 | Copyright and Similar Rights in Your contributions to Adapted 76 | Material in accordance with the terms and conditions of this 77 | Public License. 78 | - c. Copyright and Similar Rights means copyright and/or similar 79 | rights closely related to copyright including, without 80 | limitation, performance, broadcast, sound recording, and Sui 81 | Generis Database Rights, without regard to how the rights are 82 | labeled or categorized. For purposes of this Public License, the 83 | rights specified in Section 2(b)(1)-(2) are not Copyright and 84 | Similar Rights. 85 | - d. 
Effective Technological Measures means those measures that, 86 | in the absence of proper authority, may not be circumvented 87 | under laws fulfilling obligations under Article 11 of the WIPO 88 | Copyright Treaty adopted on December 20, 1996, and/or similar 89 | international agreements. 90 | - e. Exceptions and Limitations means fair use, fair dealing, 91 | and/or any other exception or limitation to Copyright and 92 | Similar Rights that applies to Your use of the Licensed 93 | Material. 94 | - f. Licensed Material means the artistic or literary work, 95 | database, or other material to which the Licensor applied this 96 | Public License. 97 | - g. Licensed Rights means the rights granted to You subject to 98 | the terms and conditions of this Public License, which are 99 | limited to all Copyright and Similar Rights that apply to Your 100 | use of the Licensed Material and that the Licensor has authority 101 | to license. 102 | - h. Licensor means the individual(s) or entity(ies) granting 103 | rights under this Public License. 104 | - i. NonCommercial means not primarily intended for or directed 105 | towards commercial advantage or monetary compensation. For 106 | purposes of this Public License, the exchange of the Licensed 107 | Material for other material subject to Copyright and Similar 108 | Rights by digital file-sharing or similar means is NonCommercial 109 | provided there is no payment of monetary compensation in 110 | connection with the exchange. 111 | - j. Share means to provide material to the public by any means or 112 | process that requires permission under the Licensed Rights, such 113 | as reproduction, public display, public performance, 114 | distribution, dissemination, communication, or importation, and 115 | to make material available to the public including in ways that 116 | members of the public may access the material from a place and 117 | at a time individually chosen by them. 118 | - k. Sui Generis Database Rights means rights other than copyright 119 | resulting from Directive 96/9/EC of the European Parliament and 120 | of the Council of 11 March 1996 on the legal protection of 121 | databases, as amended and/or succeeded, as well as other 122 | essentially equivalent rights anywhere in the world. 123 | - l. You means the individual or entity exercising the Licensed 124 | Rights under this Public License. Your has a corresponding 125 | meaning. 126 | 127 | - Section 2 – Scope. 128 | 129 | - a. License grant. 130 | - 1. Subject to the terms and conditions of this Public 131 | License, the Licensor hereby grants You a worldwide, 132 | royalty-free, non-sublicensable, non-exclusive, irrevocable 133 | license to exercise the Licensed Rights in the Licensed 134 | Material to: 135 | - A. reproduce and Share the Licensed Material, in whole 136 | or in part, for NonCommercial purposes only; and 137 | - B. produce, reproduce, and Share Adapted Material for 138 | NonCommercial purposes only. 139 | - 2. Exceptions and Limitations. For the avoidance of doubt, 140 | where Exceptions and Limitations apply to Your use, this 141 | Public License does not apply, and You do not need to comply 142 | with its terms and conditions. 143 | - 3. Term. The term of this Public License is specified in 144 | Section 6(a). 145 | - 4. Media and formats; technical modifications allowed. 
The 146 | Licensor authorizes You to exercise the Licensed Rights in 147 | all media and formats whether now known or hereafter 148 | created, and to make technical modifications necessary to do 149 | so. The Licensor waives and/or agrees not to assert any 150 | right or authority to forbid You from making technical 151 | modifications necessary to exercise the Licensed Rights, 152 | including technical modifications necessary to circumvent 153 | Effective Technological Measures. For purposes of this 154 | Public License, simply making modifications authorized by 155 | this Section 2(a)(4) never produces Adapted Material. 156 | - 5. Downstream recipients. 157 | - A. Offer from the Licensor – Licensed Material. Every 158 | recipient of the Licensed Material automatically 159 | receives an offer from the Licensor to exercise the 160 | Licensed Rights under the terms and conditions of this 161 | Public License. 162 | - B. No downstream restrictions. You may not offer or 163 | impose any additional or different terms or conditions 164 | on, or apply any Effective Technological Measures to, 165 | the Licensed Material if doing so restricts exercise of 166 | the Licensed Rights by any recipient of the Licensed 167 | Material. 168 | - 6. No endorsement. Nothing in this Public License 169 | constitutes or may be construed as permission to assert or 170 | imply that You are, or that Your use of the Licensed 171 | Material is, connected with, or sponsored, endorsed, or 172 | granted official status by, the Licensor or others 173 | designated to receive attribution as provided in Section 174 | 3(a)(1)(A)(i). 175 | - b. Other rights. 176 | - 1. Moral rights, such as the right of integrity, are not 177 | licensed under this Public License, nor are publicity, 178 | privacy, and/or other similar personality rights; however, 179 | to the extent possible, the Licensor waives and/or agrees 180 | not to assert any such rights held by the Licensor to the 181 | limited extent necessary to allow You to exercise the 182 | Licensed Rights, but not otherwise. 183 | - 2. Patent and trademark rights are not licensed under this 184 | Public License. 185 | - 3. To the extent possible, the Licensor waives any right to 186 | collect royalties from You for the exercise of the Licensed 187 | Rights, whether directly or through a collecting society 188 | under any voluntary or waivable statutory or compulsory 189 | licensing scheme. In all other cases the Licensor expressly 190 | reserves any right to collect such royalties, including when 191 | the Licensed Material is used other than for NonCommercial 192 | purposes. 193 | 194 | - Section 3 – License Conditions. 195 | 196 | Your exercise of the Licensed Rights is expressly made subject to 197 | the following conditions. 198 | 199 | - a. Attribution. 200 | - 1. If You Share the Licensed Material (including in modified 201 | form), You must: 202 | - A. retain the following if it is supplied by the 203 | Licensor with the Licensed Material: 204 | - i. identification of the creator(s) of the Licensed 205 | Material and any others designated to receive 206 | attribution, in any reasonable manner requested by 207 | the Licensor (including by pseudonym if designated); 208 | - ii. a copyright notice; 209 | - iii. a notice that refers to this Public License; 210 | - iv. a notice that refers to the disclaimer of 211 | warranties; 212 | - v. a URI or hyperlink to the Licensed Material to 213 | the extent reasonably practicable; 214 | - B. 
indicate if You modified the Licensed Material and 215 | retain an indication of any previous modifications; and 216 | - C. indicate the Licensed Material is licensed under this 217 | Public License, and include the text of, or the URI or 218 | hyperlink to, this Public License. 219 | - 2. You may satisfy the conditions in Section 3(a)(1) in any 220 | reasonable manner based on the medium, means, and context in 221 | which You Share the Licensed Material. For example, it may 222 | be reasonable to satisfy the conditions by providing a URI 223 | or hyperlink to a resource that includes the required 224 | information. 225 | - 3. If requested by the Licensor, You must remove any of the 226 | information required by Section 3(a)(1)(A) to the extent 227 | reasonably practicable. 228 | - 4. If You Share Adapted Material You produce, the Adapter's 229 | License You apply must not prevent recipients of the Adapted 230 | Material from complying with this Public License. 231 | 232 | - Section 4 – Sui Generis Database Rights. 233 | 234 | Where the Licensed Rights include Sui Generis Database Rights that 235 | apply to Your use of the Licensed Material: 236 | 237 | - a. for the avoidance of doubt, Section 2(a)(1) grants You the 238 | right to extract, reuse, reproduce, and Share all or a 239 | substantial portion of the contents of the database for 240 | NonCommercial purposes only; 241 | - b. if You include all or a substantial portion of the database 242 | contents in a database in which You have Sui Generis Database 243 | Rights, then the database in which You have Sui Generis Database 244 | Rights (but not its individual contents) is Adapted Material; 245 | and 246 | - c. You must comply with the conditions in Section 3(a) if You 247 | Share all or a substantial portion of the contents of the 248 | database. 249 | 250 | For the avoidance of doubt, this Section 4 supplements and does not 251 | replace Your obligations under this Public License where the 252 | Licensed Rights include other Copyright and Similar Rights. 253 | 254 | - Section 5 – Disclaimer of Warranties and Limitation of Liability. 255 | 256 | - a. Unless otherwise separately undertaken by the Licensor, to 257 | the extent possible, the Licensor offers the Licensed Material 258 | as-is and as-available, and makes no representations or 259 | warranties of any kind concerning the Licensed Material, whether 260 | express, implied, statutory, or other. This includes, without 261 | limitation, warranties of title, merchantability, fitness for a 262 | particular purpose, non-infringement, absence of latent or other 263 | defects, accuracy, or the presence or absence of errors, whether 264 | or not known or discoverable. Where disclaimers of warranties 265 | are not allowed in full or in part, this disclaimer may not 266 | apply to You. 267 | - b. To the extent possible, in no event will the Licensor be 268 | liable to You on any legal theory (including, without 269 | limitation, negligence) or otherwise for any direct, special, 270 | indirect, incidental, consequential, punitive, exemplary, or 271 | other losses, costs, expenses, or damages arising out of this 272 | Public License or use of the Licensed Material, even if the 273 | Licensor has been advised of the possibility of such losses, 274 | costs, expenses, or damages. Where a limitation of liability is 275 | not allowed in full or in part, this limitation may not apply to 276 | You. 277 | - c. 
The disclaimer of warranties and limitation of liability 278 | provided above shall be interpreted in a manner that, to the 279 | extent possible, most closely approximates an absolute 280 | disclaimer and waiver of all liability. 281 | 282 | - Section 6 – Term and Termination. 283 | 284 | - a. This Public License applies for the term of the Copyright and 285 | Similar Rights licensed here. However, if You fail to comply 286 | with this Public License, then Your rights under this Public 287 | License terminate automatically. 288 | - b. Where Your right to use the Licensed Material has terminated 289 | under Section 6(a), it reinstates: 290 | 291 | - 1. automatically as of the date the violation is cured, 292 | provided it is cured within 30 days of Your discovery of the 293 | violation; or 294 | - 2. upon express reinstatement by the Licensor. 295 | 296 | For the avoidance of doubt, this Section 6(b) does not affect 297 | any right the Licensor may have to seek remedies for Your 298 | violations of this Public License. 299 | 300 | - c. For the avoidance of doubt, the Licensor may also offer the 301 | Licensed Material under separate terms or conditions or stop 302 | distributing the Licensed Material at any time; however, doing 303 | so will not terminate this Public License. 304 | - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 305 | License. 306 | 307 | - Section 7 – Other Terms and Conditions. 308 | 309 | - a. The Licensor shall not be bound by any additional or 310 | different terms or conditions communicated by You unless 311 | expressly agreed. 312 | - b. Any arrangements, understandings, or agreements regarding the 313 | Licensed Material not stated herein are separate from and 314 | independent of the terms and conditions of this Public License. 315 | 316 | - Section 8 – Interpretation. 317 | 318 | - a. For the avoidance of doubt, this Public License does not, and 319 | shall not be interpreted to, reduce, limit, restrict, or impose 320 | conditions on any use of the Licensed Material that could 321 | lawfully be made without permission under this Public License. 322 | - b. To the extent possible, if any provision of this Public 323 | License is deemed unenforceable, it shall be automatically 324 | reformed to the minimum extent necessary to make it enforceable. 325 | If the provision cannot be reformed, it shall be severed from 326 | this Public License without affecting the enforceability of the 327 | remaining terms and conditions. 328 | - c. No term or condition of this Public License will be waived 329 | and no failure to comply consented to unless expressly agreed to 330 | by the Licensor. 331 | - d. Nothing in this Public License constitutes or may be 332 | interpreted as a limitation upon, or waiver of, any privileges 333 | and immunities that apply to the Licensor or You, including from 334 | the legal processes of any jurisdiction or authority. 335 | 336 | Creative Commons is not a party to its public licenses. Notwithstanding, 337 | Creative Commons may elect to apply one of its public licenses to 338 | material it publishes and in those instances will be considered the 339 | "Licensor." The text of the Creative Commons public licenses is 340 | dedicated to the public domain under the CC0 Public Domain Dedication. 
341 | Except for the limited purpose of indicating that material is shared 342 | under a Creative Commons public license or as otherwise permitted by the 343 | Creative Commons policies published at creativecommons.org/policies, 344 | Creative Commons does not authorize the use of the trademark "Creative 345 | Commons" or any other trademark or logo of Creative Commons without its 346 | prior written consent including, without limitation, in connection with 347 | any unauthorized modifications to any of its public licenses or any 348 | other arrangements, understandings, or agreements concerning use of 349 | licensed material. For the avoidance of doubt, this paragraph does not 350 | form part of the public licenses. 351 | 352 | Creative Commons may be contacted at creativecommons.org. 353 | -------------------------------------------------------------------------------- /Case Studies/README.md: --------------------------------------------------------------------------------
1 | # Case Studies
2 | 
3 | ## Corpus Creation
4 | 
5 | We did not divide the cases in the corpus into train, development, and test sets based on any temporal consideration or stratification, because the objective of the system that may eventually emerge from the project is not meant to be limited to any particular law or laws, nor focused on any particular period of time. On the contrary, the aim is to identify common features of judgments pronounced in relation to various legislation, by different judges, and across different temporal phases, so as to be able to use the said features to decipher the judicial decision-making process and successfully predict the nature of the order finally pronounced by the court, given a set of facts and legal arguments. While there would be a degree of subjectivity involved, given the difference in the thoughts and interpretations adopted by different judges, such differences are also to be found between two judges who are contemporaries of each other, as much as between two judges who have pronounced judgments on similar matters across a gap of decades. The focus is therefore to develop a system that would be equally successful in predicting the outcome of a judgment given the law that had been in vogue twenty years back as it would be in relation to law that is currently in practice. The validity and efficacy of the system can therefore be equally tested by applying it to cases from years back as to cases from a more recent period. In fact, if the system cannot be temporally independent, and remains limited only to the successful prediction of contemporary judgments, then it is likely to fail any test of application, because by the time the final version of the system is ready for practical application on a large scale, the laws might be amended or replaced; the judgments subsequently rendered by the court might then be as different from one pronounced today as the latter might differ from one pronounced in the twentieth century. Not acknowledging time as a factor during data sample choice therefore appears to be the prudent step in this case, especially given the exponential rate at which legislation is being amended today, as well as the fast-paced growth of technological development.
6 | 
7 | ## Explainability Annotations
8 | 
9 | Each document was annotated by 5 experts (in isolation) using the WebAnno framework. The annotators could assign ranks to the sentences selected as explanations; a higher rank indicates greater importance to the final judgment (Rank 1 being the most important). The rationale for assigning ranks to the sentences is as follows. Rank 1 was given to sentences immediately leading to the decision. Rank 2 was assigned to sentences that contributed to the decision. Rank 3 was given to sentences indicative of the disagreement of the current court with a lower court/tribunal decision. Sentences containing facts of the case that do not immediately lead to the decision but are nonetheless essential to the case were assigned Rank 4 (or lower). Note that, in practice, only a small set of sentences in a document were assigned a rank.
10 | 
11 | ## An example case description
12 | 
13 | Das Gupta, J. This appeal by special leave is against the order of the High Court at Allahabad under s. 133 of the Code of Criminal Procedure. The three appellants carry on the trade of auctioning vegetables. These vegetables, it appears, are brought in carts which are parked on the public road outside the building where the auctioning takes place. There was some dispute between these appellants and the Municipal Board which it is suggested by the appellants was really behind the move to get this order under s. 133 passed against them. It is unnecessary, however, for us to consider that matter. What appears to be clear is that the trade is carried on in a private house in the subzimandi quarter and it does happen that some amount of inconvenience is caused to people who pass by the public road because of the carts which necessarily come near this house. The real question is, whether because this trade of auctioning vegetables which the appellants carry on in their private house produce the consequence that people passing by the road are put to inconvenience, action can be taken under s. 133 of the Code of Criminal Procedure.
14 | 
15 | The High Court seems to be of the opinion - when it is clear that the business of auctioning vegetables cannot be carried on without causing obstruction to the passers by, the conduct of the business can be prohibited, even though it is carried on in a private place. It seems to us that this proposition has been put too widely. Section 133 of the Code of Criminal Procedure empowers action by the District Magistrate, Sub-Divisional Magistrate or Magistrate 1st class to remove public nuisances in certain circumstances. Two out of the several cls. of s. 133(1) in which these circumstances are set out, with which we are concerned, are the first and second clauses. The first clause provides for action by a Magistrate where he considers, on receiving a police-report or other information and on taking such evidence as he thinks fit, that any unlawful obstruction or nuisance should be removed from any way, river or channel which is or may be lawfully used by the public or from any public place. The second clause deals with the position where the conduct of any trade or occupation or the keeping of any goods or merchandise, is injurious to the health or physical comfort of the community and that in consequence such trade or occupation should be prohibited or regulated or such goods or merchandise should be removed or the keeping thereof regulated. It is difficult to see how the first clause can have any application.
16 | 
17 | Unlawful obstruction, if any, is certainly not caused by the people who carry on the trade of auctioning. If the obstruction caused by keeping the carts on the road can be considered to be unlawful obstruction within the meaning of this clause - about which we express no opinion - action can be taken against the persons causing such obstruction.
The obvious difficulty in the way of that might be that the persons who bring the carts are not the same from day to day. But whether or not any action is possible under s. 133 against the persons bringing the carts, we are unable to agree that merely because the appellants carry on auctioning in connection with which the carts are brought, they can be considered to have caused the obstruction. In our opinion, the appellants cannot be considered to be the persons causing obstruction. Turning now to the next clause, the question arises how the conduct of this auctioning trade is injurious to the health or physical comfort of the community. Undoubtedly, some amount of noise is caused while the auction is going on. That however is a necessary concomitant of buying and selling large quantities and it will be unreasonable to think that merely because some amount of noise is caused which people preferring perfect peace may not like, this is injurious to the physical comfort, or health of the community. It appears to us that the conduct of trades of this nature and indeed of other trades in localities of a city where such trades are usually carried on, is bound to produce some discomfort, though at the same time resulting perhaps in the good of the community in other respects. If a trade like auctioning, which is necessary for the well being of the community, has to be carried on, some amount of noise has to be borne in at least that part of the town where such trade is ordinarily carried on.
18 | 
19 | In making the provisions of s. 133 of the Code of Criminal Procedure, the legislature cannot have intended the stoppage of such trades in such part of the town, merely because of the discomfort caused by the noise in carrying on the trade. In our opinion therefore, the slight discomfort that may be caused to some people passing by the road or living in the neighbourhood cannot ordinarily be considered to be such as to justify action under s. 133 of the Code of Criminal Procedure. We do not think that the orders are justified under s. 133.
20 | 
21 | DECISION ??
22 | 
23 | ## Agreement in judgment prediction for annotators
24 | 
25 | * __*Annotation Assignment 1954_13:*__ In this case, although the original decision is that the appeal has been rejected, Users 1-4 have reached the decision that it has been accepted, while User 5 has decided that it has been rejected. This discrepancy appears to owe its origin to the very nature of the case and the issues considered by the court. There had been more than one such issue, and separate arguments had been made by the appellant in favour of each such issue and its associated prayer. The court appears to have agreed with some of the arguments and disagreed with the rest. Given the binary nature of the choice available to the Users, they could therefore decide the matter either way, as seems to have been done. The exact decision might therefore have been better described as partially accepted/partially rejected; however, this alternative option had not been made available. This indicates that for cases involving multiple issues, providing such a third option might yield more accurate results and greater uniformity of user decisions.
26 | 
27 | * __*Annotation Assignment 1961\_417:*__ In this case, although the original decision is that the appeal has been rejected, Users 2 and 4 have decided that it has been accepted.
User 2 appears to have misconstrued certain positions of law and relied unduly upon one of the other cases cited as precedent (but not considered relevant by the Supreme Court), which might account for the divergence. In the case of User 4, however, the issue appears to be more of a linguistic matter. User 4 has referred to a particular statement made by the court, "The main question that arises in this appeal is whether an illegitimate son of a sudra vis-a-vis his self acquired property, after having succeeded to a half share of his putative fathers estate, will be entitled to succeed to the other half share got by the widow, after the succession opened out to his putative father on the death of the said widow." From this sentence, User 4 has drawn the inference that the appellant was the one asking to establish such entitlement. Since the court in subsequent comments agreed that such entitlement does exist, User 4 inferred that the appeal had been accepted. However, in reality, the appellant had been contesting such entitlement. The court's singular way of drafting the issue under contention, together with User 4 not having studied more thoroughly the facts leading to the case in the first place (which might have helped the user reach a conclusion about the appellant's prayers more accurately), appears to have contributed to this discrepancy.
28 | 
29 | * __*Annotation Assignment 1962\_47:*__ In this case, although the original decision is that the appeal has been rejected, Users 2 and 5 have decided that it has been accepted. This discrepancy appears to owe its origin to both of them having been misled by Sentence 17 of the case, which appears to refer to the Supreme Court having accepted an appeal and merely giving reasons for such order in the present case. However, the case in point was actually arising from an application for review of the court's earlier judgment (acceptance of the appeal), and therefore, when the court was affirming its earlier judgment and giving reasons behind it, it was in reality rejecting this present application for review, which had been made by the party (respondent in the original appeal) aggrieved by the acceptance of such appeal by the court earlier. Users 2 and 5 apparently could not distinguish the appeal from the review petition, and that appears to have led to the discrepancy.
30 | 
31 | 
32 | 
33 | ## Example explanation annotation
34 | 
35 | The justification of each annotator (user) for the choice of explanation sentences (with the associated ranks) is given below.
36 | 
37 | _**User 1**_: The case is by way of an appeal from an order of the High Court u/s 133 CrPC. Based on sentences 20 ("In our opinion, the appellants cannot be considered to be the persons causing obstruction") and 24-28 (especially 27 and 28: "In our opinion therefore, the slight discomfort that may be caused to some people passing by the road and living in the neighbourhood cannot ordinarily be considered to be such as to justify action under S. 133 of the CrPC. We do not think that the orders are justified under S. 133."), it seems clear that the Supreme Court is differing with the order passed by the High Court (the fact situation and prior trial results from the High Court make it clear that the High Court had taken an action under S. 133 that has aggrieved the appellants in this case) and favouring the appellants. Therefore, it appears that the Supreme Court has decided to accept the appeal.
The aforementioned sentences (20, 24-28) clearly indicate that the court is considering the appellants' contentions and action favourably. Interpretation of the legislative intention in Sentence 26 and justification of the appellants' action by way of facts in Sentences 24 and 25 all point to this, along with Sentences 20, 27 and 28 as described above. Hence these are considered Rank 1 sentences. Similarly, Sentences 15-19 show how the court is considering the provisions of S. 133, particularly clause (1) thereof, applying those to the facts at hand, and arguing that the appellants' actions cannot be considered as causing obstruction. These sentences lead to the opinion in Sentence 20, and have also been considered as Rank 1. Sentence 10 also makes it clear that the court is opining that the High Court has adopted too wide an interpretation of a proposition, thus highlighting the divergence of opinion further. Hence this sentence is also considered as Rank 1.
38 | In Sentence 7, the issue of causing obstruction has been identified as the main contention; without this, it would have been difficult to develop the subsequent reasoning referred to above. Hence this sentence is also considered as Rank 1.
39 | Sentences 5, 6, 8, 9, 11-14, 21-23 highlight other relevant portions of the fact scenario, the description of the legislative provision involved, and the rejection by the Supreme Court of some of the arguments made by the respondents and apparently accepted by the High Court. While not of prime importance, these sentences are nonetheless relevant and have therefore been marked Rank 2.
40 | Finally, the exact nature of the case being by way of an appeal from an order of the High Court under S. 133 CrPC is mentioned in Sentence 2, which is considered Rank 3, since it lays the foundation for the entire discussion.
41 | 
42 | _**User 2:**_ This is given in the table below.
43 | 
44 | Sentence No. | Rank assigned | Reasoning
45 | -------------|---------------|-----------
46 | 7 | 2 | This sentence deals with the undisputed facts of the case that the court has accepted and will base its decision on.
47 | 8 | 2 | This is the fundamental question that the court will need to answer, after looking at the facts of the case, to determine whether the conditions of Section 133 CrPC are satisfied or not.
48 | 11 | 3 | This sentence states that the claim against the vegetable vendors is one of public nuisance under Section 133.
49 | 12 | 2 | This sentence sets out that the case can only be considered under either clause (a) or clause (b) of Section 133(1) of the CrPC, thereby restricting the scope of enquiry.
50 | 16 | 3 | This sentence analyses why clause (a) of Section 133(1) does not have any application in this case. This sentence does not directly affect the judgment that will be reached by the court.
51 | 17 | 3 | This sentence states that even if there was an obstruction caused, this judgment will not deal with it, as it would not be applicable against the vegetable vendors. This sentence does not directly affect the judgment that will be reached by the court.
52 | 18 | 3 | The court states the practical difficulty in enforcing a Section 133(1)(a) order. This sentence does not directly affect the judgment that will be reached by the court.
53 | 20 | 1 | This sentence concludes that the vegetable vendors did not infringe Section 133(1)(a).
54 | 22, 23 | 1 | These sentences lay out the primary reasoning used by the court to dismiss any claims relating to physical comfort or the health of the community under Section 133(1)(b).
55 | 26 | 1 | The court interprets the intent of the legislature in formulating Section 133(1)(b) and holds that it does not affect the right to trade. This helps buttress the primary reasoning by clarifying the legislative intent of this Section.
56 | 27 | 1 | The court's final holding is that there is no violation under Section 133 of the CrPC. This helps in deciding whether the court accepted or rejected the case.
57 | 29 | 1 | Since this was an appeal against an order of the High Court allowing an order under Section 133 of the CrPC, and this court dismissed the claims arising from Section 133(1)(a) and (b), the court, therefore, accepted the special leave appeal filed by the vegetable vendors.
58 | 
59 | _**User 3:**_ The sentences were ranked taking into account their relative importance with respect to the final decision regarding the appeal. Sentence 1 simply stated the name of the judge authoring the decision of the court. It is unmarked since, ideally, the identity of the judge should not influence judicial rationale, and further, no personal subjectivity was specifically noticeable in the case. Since sentence 2 merely stated the nature of the case, i.e., an appeal to the Supreme Court, and the charging provision under law, it was left unranked as it did not specifically factor into the decision. Sentences 5 and 6 were also unranked. This was because, though a dispute was mentioned in sentence 5, sentence 6 mentioned that the court did not take it into consideration.
60 | The background facts are laid out in Sentences 3, 4 and 7. Since facts are essentially established at the trial court level, and matters at the appellate level, particularly when before the Supreme Court, are to be decided on the basis of legal points alone, these have not been considered absolutely crucial to the final decision. However, the application of law to facts is always relevant, specifically given that the case at hand was dealing with a criminal charge. Sentences 3 and 4 provide supplementary background information and are hence ranked at Rank 3. The facts more closely related to the cause of action are provided in Sentence 7, so it is marked at Rank 2.
61 | Sentence 8 provides the key issue under consideration of the court. The question is very relevant with reference to the answer given in the decision, so it is provided Rank 1. Sentence 9, containing the challenged position of the High Court, is provided Rank 2 since, though worth considering, its veracity is ultimately under question in the appeal. Sentence 10, however, is Rank 1, as the Supreme Court specifically explains its stance regarding the High Court's position therein. Sentences 11 and 12 are also Rank 1 since the Court lays out in them the applicable law which it will use in its decision. Sentences 13 and 14, though, are Rank 2 as they simply paraphrase the text of the statute but nonetheless provide the tests to be applied. Sentences 15 to 28 are Rank 1 as the law is applied to the facts in these sentences. They also contain the interpretation of the factual matrix juxtaposed against the two clauses of Section 133, CrPC (the applicable provision). Sentences 15 to 20 establish the inapplicability of the first clause while sentences 21 to 27 deal with the second clause. These sentences together form the operative part of the decision.
62 | 
63 | _**User 4:**_ The assignment was to rank each sentence in the given judgment from 1 to 10 in descending order of importance. The rank was to be decided based on how necessary the knowledge of a particular sentence was for reaching the final judgment. The present judgment was an appeal under Section 133 of the Criminal Procedure Code, which provides for the procedure in urgent cases requiring the removal of a public nuisance. I used lower ranks like 5 and 6 to mark the facts of the case that were not material to the judgment but were required to get a full picture of the situation. These included the occupation of the appellants and their modus operandi (buying the vegetables in carts and parking the carts on a public road outside buildings). The tussle between the appellants and the Municipal Board was left unmarked since it was immaterial in the appeal. Undisputed facts were also either left unmarked or marked low (Rank 5, 6).
64 | Since it was a public nuisance case, the sources of the nuisance, that is, the noise from the auction of vegetables and the carts on the road, were considered material and were marked Rank 3. Similarly, the question raised before the court was ranked 3. The judgment of the High Court, though necessary, has been marked 3. These are facts that the court should be privy to in order to make a decision, but not facts or statements that the decision would be based on.
65 | Further, the court's reasoning was marked the highest (Ranks 1 and 2). The legal observation that the court makes of the impugned judgment (statement 10) is important and is therefore given Rank 2. Even the court's understanding and explanation of the legal provision are vital to deciding the case. Thus, the sentences where the court notes that action under Section 133 can only be taken under certain circumstances, and narrows the focus to specific subparts of the section, were also ranked 2. A restatement of the clauses, however, was ranked 3. The analysis of the provision is more important for reaching the decision. The illustration connecting the section to the present case is also ranked 3, being an extension of the restatement and explanation of the sub-clauses. Accordingly, the court's analysis of the provision in statement 19 was ranked 2, and its conclusion that noise is not an obstruction as defined in Section 133 was ranked 1. Similarly, the observations of fact in statements 21 through 24 are essential but not decisive and were marked 3. The obiter in statement 25 is a widely worded statement having no bearing on law or fact and has been left unmarked. Even the observation on the intention of the legislature in drafting this clause was left unmarked since legislative interpretation was not necessary to conclude the present case. The reason for acceptance of the appeal was that the noise produced was trivial and could not amount to a nuisance. Therefore, statements 27 and 28, stating this reasoning, have been ranked 1 for being of utmost importance.
66 | 
67 | _**User 5:**_ The main question in the case was not about what the law is, but whether the act of the appellants amounted to a violation of the statute. Therefore, the court's conclusion that the appellants did not cause obstruction or nuisance is to be considered the most important part of the judgment. For the same reason, the court's finding that 'slight discomfort' will not justify an order like the one in the case clinches the case in favour of the appellant and gets the most value.
68 | The conclusion on the facts was only possible by piecing together several smaller findings of fact and the judges' reasoning. This part will be ranked second in terms of value, but the importance of individual findings will vary according to the degree of proximity each finding has to the final conclusion. For example, the finding that the obstruction was not caused by the appellants, but by the people who bring the carts, will have more value than the part which states that different people might have come for the auction on different days. This is in spite of the fact that both ultimately help the court to conclude that the obstruction was not caused by the appellants.
69 | In the first clause, the term 'unlawful obstruction' is given more value than 'nuisance', as the court only tries to find whether or not the appellants caused any unlawful obstruction. Whether or not they caused any nuisance is not pondered upon by the court. For the same reason, the important term in the second clause is 'injurious to the health or physical comfort'.
70 | The part which simply reiterates the plain words of the statute will have lesser significance, compared to the part which interprets the statute and explains what the statute actually means. So, the part where the court explains why the creators of the relevant law would not have intended to cover even 'slight discomforts' will have more value than the part which merely states what the exact words in the statute say. While the court did find that some noise was generated, that finding gets the least significance, as the court's finding that the noise was of low intensity trumps the previous argument.
71 | 
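For concreteness, the ranked explanation annotations discussed above can be thought of as a mapping from ranks to sentence spans, alongside the gold decision. A minimal sketch in Python, using sentences from the example case above (the structure and field names are illustrative only, not the exact schema of the released annotation files such as `gold_explanations_ranked.json`):

```python
# Illustrative sketch of one document's expert annotation.
# Hypothetical field names; see the ILDC_expert data for the real WebAnno/CSV formats.
example_annotation = {
    "decision": "ACCEPTED",  # gold judgment label for the example case above
    "explanations": {
        "RANK 1": [
            "In our opinion, the appellants cannot be considered to be the persons causing obstruction.",
        ],
        "RANK 2": [
            "These vegetables, it appears, are brought in carts which are parked on the public road outside the building where the auctioning takes place.",
        ],
    },
}
```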
-------------------------------------------------------------------------------- /Data/DataScraping/Getting_links.py: --------------------------------------------------------------------------------
1 | 
2 | import requests
3 | import re
4 | from bs4 import BeautifulSoup
5 | import time
6 | import json
7 | 
8 | URL = 'https://indiankanoon.org/browse/' # base browse page of the site being scraped
9 | page = requests.get(URL) # send a request to the url
10 | soup = BeautifulSoup(page.content, 'html.parser')
11 | 
12 | results = soup.find_all(class_ = 'browselist')
13 | results = list(results[0:1]) # keep only the first court listing (Supreme Court of India)
14 | 
15 | links = {}
16 | no_of_pages = list(range(100))
17 | base = 'https://indiankanoon.org'
18 | 
19 | for link in results: # loop over courts, if scraping data from multiple courts
20 |     linkd = link.find('a')['href']
21 |     court_name = link.find('a').text
22 |     URL = base+linkd
23 |     page = requests.get(URL)
24 |     soup = BeautifulSoup(page.content, 'html.parser')
25 |     result_new = soup.find_all(class_ = 'browselist')
26 |     links[court_name] = []
27 |     for link_new in result_new: # loop for every year
28 |         print((link_new.find('a').text) + " Year Started .....\n")
29 |         if(int(link_new.find('a').text) < 1947 or int(link_new.find('a').text) > 2020): # keep only years 1947-2020
30 |             continue
31 |         URL = base + link_new.find('a')['href']
32 |         page = requests.get(URL)
33 |         soup = BeautifulSoup(page.content, 'html.parser')
34 |         result_new2 = soup.find_all(class_ = 'browselist')
35 |         for link_new2 in result_new2: # loop for every month
36 |             for page_in in no_of_pages: # loop for every results page
37 |                 time.sleep(1) # be polite to the server
38 |                 URL = base + link_new2.find('a')['href']
39 |                 URL = URL + '&pagenum={}'.format(page_in)
40 |                 page = requests.get(URL)
41 |                 soup = BeautifulSoup(page.content, 'html.parser')
42 |                 result_new3 = soup.find_all(class_ = 'result_url')
43 |                 if(len(result_new3) == 0): # no more results on this page: move to the next month
44 |                     break
45 |                 for link_new3 in result_new3: # finally, append each case URL to the list
46 |                     URL = base + link_new3['href']
47 |                     links[court_name].append(URL)
48 |     print("Current Year Completed\n")
49 | 
50 | 
51 | valid_court_name = ['Supreme Court of India']
52 | 
53 | final_list = {}
54 | for court_name in valid_court_name:
55 |     final_list[court_name] = links[court_name]
56 | 
57 | 
58 | json_object = json.dumps(final_list, indent = 4)
59 | 
60 | # save the dictionary of links to a json file
61 | with open("links_Supreme_Court.json", "w") as outfile:
62 |     outfile.write(json_object)
63 | 
-------------------------------------------------------------------------------- /Data/DataScraping/README.md: --------------------------------------------------------------------------------
1 | ### Instructions
2 | 
3 | - First run the `Getting_links.py` file to get all the links of the cases (you may need to change the output file path in the code to suit your setup).
4 | - After getting the file with all the case links, run the `script_for_download_data_of_SCI.ipynb` notebook and use the links file as its input to extract the text of all the cases (again, change the output path for the text files as needed).
5 | > **Note:** If you are using some other site for scraping data, you will need to change the URL and a few other details, as the format of that site might not be the same as Indian Kanoon's.
6 | 
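The links file written by `Getting_links.py` is a JSON object mapping a court name to a list of case URLs. A minimal sketch of loading it back, assuming the default `links_Supreme_Court.json` output name used in the script:

```python
import json

# Load the links file produced by Getting_links.py
with open("links_Supreme_Court.json") as f:
    links = json.load(f)

urls = links["Supreme Court of India"]  # list of case URLs for the court
print(f"{len(urls)} case links collected")
```

The download notebook then writes each case as plain text under `Yearwise_data/<year>/<year>_<n>`.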
-------------------------------------------------------------------------------- /Data/DataScraping/script_for_download_data_of_SCI.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 0,
6 |    "metadata": {
7 |     "colab": {},
8 |     "colab_type": "code",
9 |     "id": "k9FyTv0k5DUo"
10 |    },
11 |    "outputs": [],
12 |    "source": [
13 |     "import os\n",
14 |     "import urllib\n",
15 |     "import urllib.request\n",
16 |     "import json\n",
17 |     "import requests\n",
18 |     "from bs4 import BeautifulSoup\n",
19 |     "import pandas as pd\n",
20 |     "import time\n",
21 |     "import progressbar"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": null,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "path_of_Supreme_court_json_file = \"links_Supreme_Court.json\" # path of the Supreme Court links json file"
31 |    ]
32 |   },
33 |   {
34 |    "cell_type": "code",
35 |    "execution_count": null,
36 |    "metadata": {
37 |     "colab": {
38 |      "base_uri": "https://localhost:8080/",
39 |      "height": 1000
40 |     },
41 |     "colab_type": "code",
42 |     "id": "nHO3gX5_5DU1",
43 |     "outputId": "638f3888-e605-4195-fefe-76585fbc2938"
44 |    },
45 |    "outputs": [],
46 |    "source": [
47 |     "json_file = path_of_Supreme_court_json_file\n",
48 |     "with open(json_file) as f:\n",
49 |     "    links_json = json.load(f) # json.load expects a file object, not a path string\n",
50 |     "category = 'Supreme Court of India'\n",
51 |     "print(\"Started file ... {0} with docs = {1}\\n\\n\\n\".format(json_file, len(links_json[category])))\n",
52 |     "category_links = links_json[category]\n",
53 |     "year = json_file[len(json_file)-9:len(json_file)-5] # obtain the year from a file name ending in '<year>.json', like '1947' or '2002'; adjust this slice if your links file is named differently\n",
54 |     "ndocs = 0\n",
55 |     "os.makedirs('Yearwise_data/' + year, exist_ok=True)\n",
56 |     "\n",
57 |     "while(len(category_links)>0):\n",
58 |     "    time.sleep(2)\n",
59 |     "    links_done_in_this_loop = []\n",
60 |     "    for i in progressbar.progressbar(range(len(category_links))):\n",
61 |     "        BASE_URL = category_links[i]\n",
62 |     "        try:\n",
63 |     "            html = urllib.request.urlopen(BASE_URL).read()\n",
64 |     "        except urllib.error.HTTPError:\n",
65 |     "            print(\"Occurred at doc\", ndocs)\n",
66 |     "            time.sleep(2) # the failed URL stays in category_links and is retried on the next pass of the while loop\n",
67 |     "\n",
68 |     "        else:\n",
69 |     "            soup = BeautifulSoup(html, \"lxml\")\n",
70 |     "            data_html = soup.find(\"div\", attrs={\"class\": \"judgments\"})\n",
71 |     "            text = data_html.get_text()\n",
72 |     "            path = 'Yearwise_data/' + year + '/' + year + '_' + str(ndocs) # path where the case text will be saved\n",
73 |     "            with open(path, \"w+\") as out:\n",
74 |     "                out.write(text)\n",
75 |     "            ndocs = ndocs + 1\n",
76 |     "            links_done_in_this_loop.append(BASE_URL)\n",
77 |     "            if(ndocs%100==0):\n",
78 |     "                time.sleep(2) # after every 100 downloaded documents, sleep for 2 seconds\n",
79 |     "    for link in links_done_in_this_loop:\n",
80 |     "        category_links.remove(link)\n",
81 |     "    print(\"Docs that were downloaded: {0}\\n\\n\\n\".format(ndocs))"
82 |    ]
83 |   }
84 |  ],
85 |  "metadata": {
86 |   "colab": {
87 |    "name": "script_for_download_data_of_SCI.ipynb",
88 |    "provenance": []
89 |   },
90 |   "kernelspec": {
91 |    "display_name": "Python 3",
92 |    "language": "python",
93 |    "name": "python3"
94 |   },
95 |   "language_info": {
96 |    "codemirror_mode": {
97 |     "name": "ipython",
98 |     "version": 3
99 |    },
100 |    "file_extension": ".py",
101 |    "mimetype": "text/x-python",
102 |    "name": "python",
103 |    "nbconvert_exporter": "python",
104 |    "pygments_lexer": "ipython3",
105 |    "version": "3.7.7"
106 |   }
107 |  },
108 |  "nbformat": 4,
109 |  "nbformat_minor": 1
110 | }
\"REJECTED\",\n", 61 | " \"RANK1\": \"RANK 1\",\n", 62 | " \"Rank1\": \"RANK 1\",\n", 63 | " \"Rank2\": \"RANK 2\",\n", 64 | " \"Rank3\": \"RANK 3\",\n", 65 | " \"Rank4\": \"RANK 4\",\n", 66 | " \"Rank5\": \"RANK 5\",\n", 67 | " \"Rank6\": \"RANK 6\",\n", 68 | " \"Rank7\": \"RANK 7\",\n", 69 | " \"Rank8\": \"RANK 8\",\n", 70 | " \"Rank9\": \"RANK 9\"}" 71 | ], 72 | "metadata": { 73 | "id": "KpQ-169zo7dp" 74 | }, 75 | "execution_count": 32, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "source": [ 81 | "if not os.path.exists(output_path):\n", 82 | " os.mkdir(output_path)" 83 | ], 84 | "metadata": { 85 | "id": "l7QNmsKaydFe" 86 | }, 87 | "execution_count": 47, 88 | "outputs": [] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "for case in os.listdir(path):\n", 94 | " curr_path = os.path.join(path, case)\n", 95 | " if not os.path.exists(os.path.join(output_path, case)):\n", 96 | " os.mkdir(os.path.join(output_path, case))\n", 97 | " for user in os.listdir(curr_path):\n", 98 | " try:\n", 99 | " with open(os.path.join(curr_path, user),\"r\") as f:\n", 100 | " data = json.load(f)\n", 101 | " f.close()\n", 102 | " for key in data['_referenced_fss'].keys():\n", 103 | " case_text = data['_referenced_fss'][key]['sofaString']\n", 104 | " rows = []\n", 105 | " for key in list(data['_views']['_InitialView'].keys()):\n", 106 | " if key in key_to_label.keys():\n", 107 | " for sent in data['_views']['_InitialView'][key]:\n", 108 | " curr = {} \n", 109 | " curr[\"TEXT\"] = case_text[sent['begin']: sent['end']]\n", 110 | " curr[\"LABEL\"] = key_to_label[key]\n", 111 | " rows.append(curr)\n", 112 | " df = pd.DataFrame(rows)\n", 113 | " df.to_csv(os.path.join(os.path.join(output_path, case), user[:-4]+\".csv\"))\n", 114 | " except:\n", 115 | " continue" 116 | ], 117 | "metadata": { 118 | "id": "SdneiWwLoeBh" 119 | }, 120 | "execution_count": 50, 121 | "outputs": [] 122 | } 123 | ] 124 | } -------------------------------------------------------------------------------- /Data/ILDC_expert/README.md: -------------------------------------------------------------------------------- 1 | ### Description of the Dataset ### 2 | The **ILDC_expert** have two repositories, source and annotation. The source repository contains the preprocessed documents (56 in total), which was given to the 5 different annotators. The annotation repository also contains the 56 document folders but here each folder has 5 annotated json files corresponding to every user. 3 | 4 | Use the `ILDC_EXPERT_Convert_To_CSV` file to convert the json files for each user and each case to csv files. The csv file contain 2 columns a `TEXT` and a `LABEL` column. `TEXT` is a section of the case and `LABEL` is it's rank. 5 | -------------------------------------------------------------------------------- /Data/ILDC_multi/README.md: -------------------------------------------------------------------------------- 1 | ## Link of the Dataset ## 2 | The **ILDC_multi** dataset (34816) including train (32305), development (994) and test (1517). 3 | 4 | ### Description of the Dataset ### 5 | 6 | The link contains the CSV file of **ILDC_multi** dataset, which has four columns ['text', 'label', 'split', 'name']. 
-------------------------------------------------------------------------------- /Data/ILDC_multi/README.md: --------------------------------------------------------------------------------
1 | ## Link of the Dataset ##
2 | The **ILDC_multi** dataset has 34816 documents in total, split into train (32305), development (994) and test (1517) sets.
3 | 
4 | ### Description of the Dataset ###
5 | 
6 | The link points to the CSV file of the **ILDC_multi** dataset, which has four columns ['text', 'label', 'split', 'name'],
7 | where:
8 | * 'text' contains the preprocessed case text
9 | * 'label' contains either '0' or '1'
10 | * '0' means all petitions in the case were rejected
11 | * '1' means at least one petition was accepted
12 | * 'split' indicates whether the file belongs to the train, validation or test set
13 | * 'name' gives the name of the file.
14 | 
15 | ### Example of Train set ###
16 | 
17 | text | label | split | name
18 | ------------------------------------------------- | ----- | ----- | ----
19 | Uday Umesh Lalit, J. These appeals arise out ... | 0 | train | 2020_1.txt
20 | Indira Banerjee, J. These appeals are against... | 0 | train | 2020_2.txt
21 | M. Khanwilkar, J. Delay companydoned. Leave g... | 1 | train | 2020_12.txt
22 | 
23 | ### Example of Validation/Development set ###
24 | 
25 | text | label | split | name
26 | ------------------------------------------------- | ----- | ----- | ----
27 | civil appellate jurisdiction civil appeal numb... | 0 | dev | 1989_75.txt
28 | original jurisdiction writ petitions number. 8... | 0 | dev | 1985_233.txt
29 | \nsarkar j. \n\nwe think this appeal must be a... | 1 | dev | 1963_285.txt
30 | 
31 | 
32 | ### Example of Test set ###
33 | 
34 | text | label | split | name
35 | ------------------------------------------------- | ----- | ----- | ----
36 | civil appellate jurisdiction civil appeal numb... | 1 | test | 1986_397.txt
37 | criminal appellate jurisdiction special leave ... | 0 | test | 1977_183.txt
38 | criminal appellate jurisdiction criminal appea... | 0 | test | 1993_98.txt
39 | 40 | 
-------------------------------------------------------------------------------- /Data/ILDC_single/README.md: --------------------------------------------------------------------------------
1 | ## Link of the Dataset ##
2 | The **ILDC_single** dataset has 7593 documents in total, split into train (5082), development (994) and test (1517) sets.
3 | 
4 | ### Description of the Dataset ###
5 | 
6 | The link points to the CSV file of the **ILDC_single** dataset, which has four columns ['text', 'label', 'split', 'name'],
7 | where:
8 | * 'text' contains the preprocessed case text
9 | * 'label' contains either '0' or '1'
10 | * '0' means all petitions in the case were rejected
11 | * '1' means all petitions were accepted
12 | * 'split' indicates whether the file belongs to the train, validation or test set
13 | * 'name' gives the name of the file.
14 | 
15 | ### Example of Train set ###
16 | 
17 | text | label | split | name
18 | ------------------------------------------------- | ----- | ----- | ----
19 | F. NARIMAN, J. Leave granted. In 2008, the Pu... | 1 | train | 2019_890.txt
20 | S. THAKUR, J. Leave granted. These appeals ar... | 0 | train | 2014_170.txt
21 | Markandey Katju, J. Leave granted. Heard lear... | 1 | train | 2010_721.txt
22 | 
23 | *The Validation/Development and Test sets are the same as for **ILDC_multi**.*
24 | 
25 | ### Example of Validation/Development set ###
26 | 
27 | text | label | split | name
28 | ------------------------------------------------- | ----- | ----- | ----
29 | civil appellate jurisdiction civil appeal numb... | 0 | dev | 1989_75.txt
30 | original jurisdiction writ petitions number. 8... | 0 | dev | 1985_233.txt
31 | \nsarkar j. \n\nwe think this appeal must be a... | 1 | dev | 1963_285.txt
32 | 
33 | 
34 | ### Example of Test set ###
35 | 
36 | text | label | split | name
37 | ------------------------------------------------- | ----- | ----- | ----
38 | civil appellate jurisdiction civil appeal numb... | 1 | test | 1986_397.txt
39 | criminal appellate jurisdiction special leave ... | 0 | test | 1977_183.txt
40 | criminal appellate jurisdiction criminal appea... | 0 | test | 1993_98.txt
41 | 
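42 | A minimal sketch of loading the dataset and recovering the splits (assumes the CSV has been downloaded as `ILDC_single.csv`; the same pattern works for **ILDC_multi**):
43 | 
44 | ```
45 | import pandas as pd
46 | 
47 | df = pd.read_csv("ILDC_single.csv")
48 | # the 'split' column separates the sets, as described above
49 | train = df[df["split"] == "train"]
50 | dev = df[df["split"] == "dev"]
51 | test = df[df["split"] == "test"]
52 | print(len(train), len(dev), len(test))  # expected: 5082 994 1517
53 | ```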
-------------------------------------------------------------------------------- /Data/Preprocessing/label_generation_multi.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # In[ ]:
5 | 
6 | 
7 | import os
8 | import re
9 | import spacy
10 | import random
11 | import nltk
12 | from nltk.stem import PorterStemmer
13 | from nltk.tokenize import word_tokenize
14 | import nltk.data
15 | import time
16 | import sys
17 | import progressbar
18 | import random
19 | import shutil
20 | import progressbar
21 | import json
22 | 
23 | nltk.download('punkt')
24 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
25 | 
26 | input_paths_of_files = "gdrive/My Drive/data/" # give the path to the directory that contains all the case files
27 | files = os.listdir(input_paths_of_files)
28 | dirs = [os.path.join(input_paths_of_files, f) for f in files] # full paths of all case files (added so that 'dirs' used in the loop below is defined; note that the slicing file_path[26:] further down assumes a particular prefix length, so adjust it to your path)
29 | data = {}
30 | data['data']={}
31 | like_orig_path, summ_feed, js_path = "labelled/", "labelled_clean/", "./" # output directories (assumed placeholder paths: create Accepted/ and Rejected/ subfolders inside the first two, and change all three to suit your setup)
32 | def label_for_file(sentence_set): # an implementation to find the label of each case, under the assumption that every case has words like accepted, rejected etc. at the end of the case
33 |     label = -1
34 |     sentnum = -1
35 |     for i in range(len(sentence_set)):# loop through all sentences in which we will check for words like accepted, rejected etc.
36 |         sentence = sentence_set[i]
37 |         sentence = sentence.lower()
38 |         A_accepted = ("accepted" in sentence) or ("accept" in sentence) or ("accepting" in sentence)# for accepted cases we check words like accepted, allowed, permitted and their variations
39 |         A_allowed = ("allowed" in sentence) or ("allow" in sentence) or ("allowing" in sentence)
40 |         A_permitted = ("permitted" in sentence) or ("permit" in sentence) or ("permitting" in sentence)
41 |         A = A_accepted or A_allowed or A_permitted
42 |         R_rejected = ("rejected" in sentence) or ("reject" in sentence) or ("rejecting" in sentence)# for rejected cases we check words like rejected, disposed, dismissed and their variations
43 |         R_disposed = ("disposed" in sentence) or ("dispose" in sentence) or ("disposing" in sentence)
44 |         R_dismissed = ("dismissed" in sentence) or ("dismiss" in sentence) or ("dismissing" in sentence)
45 |         R = R_rejected or R_dismissed or R_disposed
46 |         Appeal = ("appeal" in sentence) or ("appeals" in sentence)# we check if any sentence has an accepted-type word and appeal together in it, and similarly for rejected-type words
47 |         App = Appeal
48 |         Not = ("not" in sentence) or ("no" in sentence)
49 |         if(App and A and R and Not):
50 |             break
51 |         if(App and A and R):
52 |             break
53 |         if(App and A):
54 |             label = 1
55 |             sentnum = i
56 |             break
57 |         if(App and R):
58 |             label = 0
59 |             sentnum = i
60 |             break
61 |         if(App and A and Not):
62 |             break
63 |         if(App and R and Not):
64 |             break
65 | 
66 |     return label, sentnum
67 | 
68 | for i in progressbar.progressbar(range(len(dirs))):
69 |     file_path = dirs[i]
70 |     f = open(file_path, "r")
71 |     text = f.read()
72 | 
73 |     sentences = tokenizer.tokenize(text)
74 |     sentence_set = sentences[max(0,len(sentences)-100):]# we consider the last 100 sentences when checking for the label
75 |     sentence_set.reverse()# reverse the set of sentences
76 | 
77 |     label, sentnum = label_for_file(sentence_set)
78 | 
79 |     if(label==-1):# if no label is found, skip the case
80 |         continue
81 | 
82 |     cut_sentences = sentences[:len(sentences)-sentnum-1]# after the label is found, cut the end part 
of text where label was found 83 | new_sentence_set = sentences[max(0,len(cut_sentences)-10):] 84 | new_sentence_set.reverse() 85 | 86 | new_label, new_sentnum = label_for_file(new_sentence_set) 87 | 88 | if(new_label!=label): 89 | new_text = (" ").join(cut_sentences) 90 | final_text = new_text 91 | 92 | if(label==1):# label 1 for accepted 93 | p_file = open(like_orig_path + "Accepted/" + file_path[26:] + ".txt" ,"w") 94 | p_file.write(final_text) # write the final text in a file 95 | tokens = word_tokenize(final_text) 96 | num_tokens = len(tokens) 97 | data['data'][file_path[26:]] = {} 98 | data['data'][file_path[26:]]['name'] = file_path[26:] # the name key contains the name of file 99 | data['data'][file_path[26:]]['tokens'] = num_tokens# the tokens key contains number of tokens of file 100 | data['data'][file_path[26:]]['sentences'] = len(cut_sentences)# the sentences key contains the length of all sentences after removing the part which helped in label finding 101 | data['data'][file_path[26:]]['label'] = 'Accepted'# the label key contains the label of case 102 | data['data'][file_path[26:]]['proof_sentence'] = sentence_set[sentnum] # the proof_sentence key contains the sentennce which helped in label finding 103 | p_file.close() 104 | lines = final_text.split("\n") 105 | text_new = " ".join(lines) 106 | text_new = re.sub(" +"," ",text_new) 107 | CT_file = open(summ_feed + "Accepted/" + file_path[26:] + ".txt" ,"w") 108 | CT_file.write(text_new) 109 | CT_file.close() 110 | 111 | elif(label==0):# label 0 for rejected 112 | p_file = open(like_orig_path + "Rejected/" + file_path[26:] + ".txt" ,"w") 113 | p_file.write(final_text) 114 | tokens = word_tokenize(final_text) 115 | num_tokens = len(tokens) 116 | data['data'][file_path[26:]] = {} 117 | data['data'][file_path[26:]]['name'] = file_path[26:] 118 | data['data'][file_path[26:]]['tokens'] = num_tokens 119 | data['data'][file_path[26:]]['sentences'] = len(cut_sentences) 120 | data['data'][file_path[26:]]['label'] = 'Rejected' 121 | data['data'][file_path[26:]]['proof_sentence'] = sentence_set[sentnum] 122 | p_file.close() 123 | lines = final_text.split("\n") 124 | text_new = " ".join(lines) 125 | text_new = re.sub(" +"," ",text_new) 126 | CT_file = open(summ_feed + "Rejected/" + file_path[26:] + ".txt" ,"w") 127 | CT_file.write(text_new) 128 | CT_file.close() 129 | 130 | else: 131 | new_cut_sentences = cut_sentences[:len(cut_sentences)-new_sentnum-1] 132 | new_text = (" ").join(new_cut_sentences) 133 | final_text = new_text 134 | 135 | if(label==1): 136 | p_file = open(like_orig_path + "Accepted/" + file_path[26:] + ".txt" ,"w") 137 | p_file.write(final_text) 138 | tokens = word_tokenize(final_text) 139 | num_tokens = len(tokens) 140 | data['data'][file_path[26:]] = {} 141 | data['data'][file_path[26:]]['name'] = file_path[26:] 142 | data['data'][file_path[26:]]['tokens'] = num_tokens 143 | data['data'][file_path[26:]]['sentences'] = len(new_cut_sentences) 144 | data['data'][file_path[26:]]['label'] = 'Accepted' 145 | data['data'][file_path[26:]]['proof_sentence'] = sentence_set[sentnum] 146 | p_file.close() 147 | lines = final_text.split("\n") 148 | text_new = " ".join(lines) 149 | text_new = re.sub(" +"," ",text_new) 150 | CT_file = open(summ_feed + "Accepted/" + file_path[26:] + ".txt" ,"w") 151 | CT_file.write(text_new) 152 | CT_file.close() 153 | 154 | elif(label==0): 155 | p_file = open(like_orig_path + "Rejected/" + file_path[26:] + ".txt" ,"w") 156 | p_file.write(final_text) 157 | tokens = word_tokenize(final_text) 158 | 
num_tokens = len(tokens)
159 |         data['data'][file_path[26:]] = {}
160 |         data['data'][file_path[26:]]['name'] = file_path[26:]
161 |         data['data'][file_path[26:]]['tokens'] = num_tokens
162 |         data['data'][file_path[26:]]['sentences'] = len(new_cut_sentences)
163 |         data['data'][file_path[26:]]['label'] = 'Rejected'
164 |         data['data'][file_path[26:]]['proof_sentence'] = sentence_set[sentnum]
165 |         p_file.close()
166 |         lines = final_text.split("\n")
167 |         text_new = " ".join(lines)
168 |         text_new = re.sub(" +"," ",text_new)
169 |         CT_file = open(summ_feed + "Rejected/" + file_path[26:] + ".txt" ,"w")
170 |         CT_file.write(text_new)
171 |         CT_file.close()
172 | 
173 | json_file = open(js_path + "splitter.json", "w")# now save the 'data' dictionary in a json file
174 | json.dump(data, json_file)
175 | json_file.close()
-------------------------------------------------------------------------------- /Data/Preprocessing/label_generation_single.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # In[2]:
5 | 
6 | 
7 | import os
8 | import re
9 | import spacy
10 | import random
11 | import nltk
12 | from nltk.stem import PorterStemmer
13 | from nltk.tokenize import word_tokenize
14 | import nltk.data
15 | import time
16 | import sys
17 | import progressbar
18 | import random
19 | import shutil
20 | import progressbar
21 | import json
22 | nltk.download('punkt') # make sure the punkt tokenizer is available
23 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
24 | 
25 | input_paths_of_files = "gdrive/My Drive/data/" # give the path to the directory that contains all the case files
26 | files = os.listdir(input_paths_of_files)
27 | 
28 | corpus = []
29 | label = []
30 | filen = []
31 | 
32 | ctAAcce = [] #"appeal accepted."
33 | ctADisp = [] #"appeal disposed."
34 | ctAAllo = [] #"appeal allowed."
35 | ctADism = [] #"appeal dismissed."
36 | ctAReje = [] #"appeal rejected."
37 | ctPAcce = [] #"petition accepted."
38 | ctPDisp = [] #"petition disposed."
39 | ctPAllo = [] #"petition allowed."
40 | ctPDism = [] #"petition dismissed."
41 | ctPReje = [] #"petition rejected."
42 | ctAsAcce = [] #"appeals accepted."
43 | ctAsDisp = [] #"appeals disposed."
44 | ctAsAllo = [] #"appeals allowed."
45 | ctAsDism = [] #"appeals dismissed."
46 | ctAsReje = [] #"appeals rejected."
47 | ctPsAcce = [] #"petitions accepted."
48 | ctPsDisp = [] #"petitions disposed."
49 | ctPsAllo = [] #"petitions allowed."
50 | ctPsDism = [] #"petitions dismissed."
51 | ctPsReje = [] #"petitions rejected."
52 | 53 | 54 | for i in progressbar.progressbar(range(len(files))):# loop through all cases 55 | 56 | file_path = os.path.join(input_paths_of_files, files[i]) 57 | f = open(files[i], "r", encoding="latin1") 58 | text = f.read() 59 | text = text.lower() 60 | textf = text[len(text)-100:]# will check if a particular text is present or not in last 100 characters for label finding 61 | 62 | if(re.search("appeal accepted\.",textf)!=None):# check if "appeal accepted" is there or not 63 | text = text[:len(text)-len(textf)+ re.search("appeal accepted\.",textf).span()[0]] 64 | sents = tokenizer.tokenize(text) 65 | sents = sents[:len(sents)-4] 66 | text = (" ").join(sents) 67 | 68 | corpus.append(text) 69 | label.append(1) 70 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 71 | ctAAcce.append(file_path) 72 | continue 73 | if(re.search("appeal rejected\.",textf)!=None): 74 | text = text[:len(text)-len(textf)+ re.search("appeal rejected\.",textf).span()[0]] 75 | sents = tokenizer.tokenize(text) 76 | sents = sents[:len(sents)-4] 77 | text = (" ").join(sents) 78 | 79 | corpus.append(text) 80 | label.append(0) 81 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 82 | ctAReje.append(file_path) 83 | continue 84 | if(re.search("appeal disposed\.",textf)!=None): 85 | text = text[:len(text)-len(textf)+ re.search("appeal disposed\.",textf).span()[0]] 86 | sents = tokenizer.tokenize(text) 87 | sents = sents[:len(sents)-4] 88 | text = (" ").join(sents) 89 | 90 | corpus.append(text) 91 | label.append(0) 92 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 93 | ctADisp.append(file_path) 94 | continue 95 | if(re.search("appeal dismissed\.",textf)!=None): 96 | text = text[:len(text)-len(textf)+ re.search("appeal dismissed\.",textf).span()[0]] 97 | sents = tokenizer.tokenize(text) 98 | sents = sents[:len(sents)-4] 99 | text = (" ").join(sents) 100 | 101 | corpus.append(text) 102 | label.append(0) 103 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 104 | ctADism.append(file_path) 105 | continue 106 | if(re.search("appeal allowed\.",textf)!=None): 107 | text = text[:len(text)-len(textf)+ re.search("appeal allowed\.",textf).span()[0]] 108 | sents = tokenizer.tokenize(text) 109 | sents = sents[:len(sents)-4] 110 | text = (" ").join(sents) 111 | 112 | corpus.append(text) 113 | label.append(1) 114 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 115 | ctAAllo.append(file_path) 116 | continue 117 | if(re.search("petition accepted\.",textf)!=None): 118 | text = text[:len(text)-len(textf)+ re.search("petition accepted\.",textf).span()[0]] 119 | sents = tokenizer.tokenize(text) 120 | sents = sents[:len(sents)-4] 121 | text = (" ").join(sents) 122 | 123 | corpus.append(text) 124 | label.append(1) 125 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 126 | ctPAcce.append(file_path) 127 | continue 128 | if(re.search("petition rejected\.",textf)!=None): 129 | text = text[:len(text)-len(textf)+ re.search("petition rejected\.",textf).span()[0]] 130 | sents = tokenizer.tokenize(text) 131 | sents = sents[:len(sents)-4] 132 | text = (" ").join(sents) 133 | 134 | corpus.append(text) 135 | label.append(0) 136 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 137 | ctPReje.append(file_path) 138 | continue 139 | if(re.search("petition disposed\.",textf)!=None): 140 | text = text[:len(text)-len(textf)+ re.search("petition disposed\.",textf).span()[0]] 141 | sents = tokenizer.tokenize(text) 142 | sents = sents[:len(sents)-4] 143 | text = (" ").join(sents) 144 | 145 | 
corpus.append(text) 146 | label.append(0) 147 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 148 | ctPDisp.append(file_path) 149 | continue 150 | if(re.search("petition dismissed\.",textf)!=None): 151 | text = text[:len(text)-len(textf)+ re.search("petition dismissed\.",textf).span()[0]] 152 | sents = tokenizer.tokenize(text) 153 | sents = sents[:len(sents)-4] 154 | text = (" ").join(sents) 155 | 156 | corpus.append(text) 157 | label.append(0) 158 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 159 | ctPDism.append(file_path) 160 | continue 161 | if(re.search("petition allowed\.",textf)!=None): 162 | text = text[:len(text)-len(textf)+ re.search("petition allowed\.",textf).span()[0]] 163 | sents = tokenizer.tokenize(text) 164 | sents = sents[:len(sents)-4] 165 | text = (" ").join(sents) 166 | 167 | corpus.append(text) 168 | label.append(1) 169 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 170 | ctPAllo.append(file_path) 171 | continue 172 | if(re.search("appeals accepted\.",textf)!=None): 173 | text = text[:len(text)-len(textf)+ re.search("appeals accepted\.",textf).span()[0]] 174 | sents = tokenizer.tokenize(text) 175 | sents = sents[:len(sents)-4] 176 | text = (" ").join(sents) 177 | 178 | corpus.append(text) 179 | label.append(1) 180 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 181 | ctAsAcce.append(file_path) 182 | continue 183 | if(re.search("appeals rejected\.",textf)!=None): 184 | text = text[:len(text)-len(textf)+ re.search("appeals rejected\.",textf).span()[0]] 185 | sents = tokenizer.tokenize(text) 186 | sents = sents[:len(sents)-4] 187 | text = (" ").join(sents) 188 | 189 | corpus.append(text) 190 | label.append(0) 191 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 192 | ctAsReje.append(file_path) 193 | continue 194 | if(re.search("appeals disposed\.",textf)!=None): 195 | text = text[:len(text)-len(textf)+ re.search("appeals disposed\.",textf).span()[0]] 196 | sents = tokenizer.tokenize(text) 197 | sents = sents[:len(sents)-4] 198 | text = (" ").join(sents) 199 | 200 | corpus.append(text) 201 | label.append(0) 202 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 203 | ctAsDisp.append(file_path) 204 | continue 205 | if(re.search("appeals dismissed\.",textf)!=None): 206 | text = text[:len(text)-len(textf)+ re.search("appeals dismissed\.",textf).span()[0]] 207 | sents = tokenizer.tokenize(text) 208 | sents = sents[:len(sents)-4] 209 | text = (" ").join(sents) 210 | 211 | corpus.append(text) 212 | label.append(0) 213 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 214 | ctAsDism.append(file_path) 215 | continue 216 | if(re.search("appeals allowed\.",textf)!=None): 217 | text = text[:len(text)-len(textf)+ re.search("appeals allowed\.",textf).span()[0]] 218 | sents = tokenizer.tokenize(text) 219 | sents = sents[:len(sents)-4] 220 | text = (" ").join(sents) 221 | 222 | corpus.append(text) 223 | label.append(1) 224 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 225 | ctAsAllo.append(file_path) 226 | continue 227 | if(re.search("petitions accepted\.",textf)!=None): 228 | text = text[:len(text)-len(textf)+ re.search("petitions accepted\.",textf).span()[0]] 229 | sents = tokenizer.tokenize(text) 230 | sents = sents[:len(sents)-4] 231 | text = (" ").join(sents) 232 | 233 | corpus.append(text) 234 | label.append(1) 235 | filen.append(file_path[file_path.rfind("/")+1:] + ".txt") 236 | ctPsAcce.append(file_path) 237 | continue 238 | if(re.search("petitions rejected\.",textf)!=None): 239 | text 
= text[:len(text)-len(textf)+ re.search("petitions rejected\.",textf).span()[0]]
240 |         sents = tokenizer.tokenize(text)
241 |         sents = sents[:len(sents)-4]
242 |         text = (" ").join(sents)
243 | 
244 |         corpus.append(text)
245 |         label.append(0)
246 |         filen.append(file_path[file_path.rfind("/")+1:] + ".txt")
247 |         ctPsReje.append(file_path)
248 |         continue
249 |     if(re.search("petitions disposed\.",textf)!=None):
250 |         text = text[:len(text)-len(textf)+ re.search("petitions disposed\.",textf).span()[0]]
251 |         sents = tokenizer.tokenize(text)
252 |         sents = sents[:len(sents)-4]
253 |         text = (" ").join(sents)
254 | 
255 |         corpus.append(text)
256 |         label.append(0)
257 |         filen.append(file_path[file_path.rfind("/")+1:] + ".txt")
258 |         ctPsDisp.append(file_path)
259 |         continue
260 |     if(re.search("petitions dismissed\.",textf)!=None):
261 |         text = text[:len(text)-len(textf)+ re.search("petitions dismissed\.",textf).span()[0]]
262 |         sents = tokenizer.tokenize(text)
263 |         sents = sents[:len(sents)-4]
264 |         text = (" ").join(sents)
265 | 
266 |         corpus.append(text)
267 |         label.append(0)
268 |         filen.append(file_path[file_path.rfind("/")+1:] + ".txt")
269 |         ctPsDism.append(file_path)
270 |         continue
271 |     if(re.search("petitions allowed\.",textf)!=None):
272 |         text = text[:len(text)-len(textf)+ re.search("petitions allowed\.",textf).span()[0]]
273 |         sents = tokenizer.tokenize(text)
274 |         sents = sents[:len(sents)-4]
275 |         text = (" ").join(sents)
276 | 
277 |         corpus.append(text)
278 |         label.append(1)
279 |         filen.append(file_path[file_path.rfind("/")+1:] + ".txt")
280 |         ctPsAllo.append(file_path)
281 | 
282 | 
283 | Acc_files = ctAAcce + ctAAllo + ctPAllo + ctPAcce # (only the singular appeal/petition lists are combined here)
284 | Rej_files = ctAReje + ctADism + ctADisp + ctPReje + ctPDism + ctPDisp
285 | 
286 | Acc_files = [f[f.rfind("/")+1:]+ ".txt" for f in Acc_files]
287 | Rej_files = [f[f.rfind("/")+1:]+ ".txt" for f in Rej_files]
288 | 
289 | f = open("/home/vijit/miniset_files/anno_acc_list.txt", "w+")# writing the list of all accepted files; change the path to suit your setup
290 | for fi in Acc_files:
291 |     f.write(fi + "\n")
292 | 
293 | f.close()
294 | 
295 | f = open("/home/vijit/miniset_files/anno_rej_list.txt", "w+") # writing the list of all rejected files; change the path to suit your setup
296 | for fi in Rej_files:
297 |     f.write(fi + "\n")
298 | 
299 | f.close()
300 | 301 | 
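302 | # NOTE: the twenty branches above all follow the same pattern and differ only in
303 | # the outcome phrase searched for and the label assigned. A sketch of an
304 | # equivalent table-driven loop (illustrative only; the per-phrase counter lists
305 | # used above are omitted here):
306 | #
307 | # outcome_to_label = {"accepted": 1, "allowed": 1, "rejected": 0,
308 | #                     "disposed": 0, "dismissed": 0}
309 | # for subject in ["appeal", "appeals", "petition", "petitions"]:
310 | #     for outcome, lab in outcome_to_label.items():
311 | #         if re.search(subject + " " + outcome + r"\.", textf) != None:
312 | #             ...  # cut the matched tail as above, then append to corpus/label/filen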
-------------------------------------------------------------------------------- /Data/Preprocessing/preprocess.py: --------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import spacy
4 | import random
5 | import nltk
6 | from nltk.stem import PorterStemmer
7 | from nltk.tokenize import word_tokenize
8 | import nltk.data
9 | import time
10 | import sys
11 | import progressbar
12 | import random
13 | import shutil
14 | import progressbar
15 | import json
16 | 
17 | nltk.download('punkt')
18 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
19 | 
20 | path_dataset_dir = "SCI/full_data/" # give the path of the directory where the dataset is saved
21 | files = os.listdir(path_dataset_dir) # all the files in the dataset
22 | 
23 | 
24 | num_files_till_now = 0 # counter for how many files have been preprocessed at a certain instance
25 | 
26 | 
27 | for i in progressbar.progressbar(range(len(files))):
28 |     file_path = os.path.join(path_dataset_dir, files[i])
29 |     f = open(file_path, "r")
30 |     text = f.read()
31 |     text = re.sub(r"\xa0"," ",text)
32 |     text = text.split("\n") # splitting using the new line character
33 |     text = [re.sub(r'[^a-zA-Z0-9.,)\-(/?\t ]','',sentence) for sentence in text] # removing everything other than these a-zA-Z0-9.,)\-(/?\t
34 |     text = [re.sub(r'(?<=[^0-9])/(?=[^0-9])',' ',sentence) for sentence in text]
35 |     text = [re.sub("\t+"," ",sentence) for sentence in text] # converting multiple tabs and spaces into a single tab or space
36 |     text = [re.sub(" +"," ",sentence) for sentence in text]
37 |     text = [re.sub("\.\.+","",sentence) for sentence in text]# these were the common noises in our data, depends on the data
38 |     text = [re.sub("\A ?","",sentence) for sentence in text]
39 |     text = [sentence for sentence in text if(len(sentence) != 1 and not re.fullmatch("(\d|\d\d|\d\d\d)",sentence))]
40 |     text = [re.sub('\A\(?(\d|\d\d\d|\d\d|[a-zA-Z])(\.|\))\s?(?=[A-Z])','\n',sentence) for sentence in text]#dividing into paragraphs wrt points
41 |     text = [re.sub("\A\(([ivx]+)\)\s?(?=[a-zA-Z0-9])",'\n',sentence) for sentence in text] #dividing into paragraphs wrt roman points
42 |     text = [re.sub(r"[()[\]\"$']"," ",sentence) for sentence in text] # removing the characters ()[\]"$' (applied per sentence, since 'text' is a list at this point)
43 |     text = [re.sub(r" no\."," number",sentence) for sentence in text] # converting no., nos., co., ltd. to number, numbers, company and limited
44 |     text = [re.sub(r" nos\."," numbers",sentence) for sentence in text]
45 |     text = [re.sub(r" co\."," company",sentence) for sentence in text]
46 |     text = [re.sub(r" ltd\."," limited",sentence) for sentence in text]
47 |     text2 = []
48 |     for index in range(len(text)):#for removing multiple new-lines
49 |         if(index>0 and text[index]=='' and text[index-1]==''):
50 |             continue
51 |         if(index" 450 | ]
451 | },
452 | "metadata": {
453 | "tags": []
454 | },
455 | "execution_count": 15
456 | }
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "metadata": {
462 | "colab_type": "code",
463 | "id": "E0LgRWkdjYFm",
464 | "colab": {
465 | "base_uri": "https://localhost:8080/",
466 | "height": 34
467 | },
468 | "outputId": "77ce0a57-3499-4487-ee29-fe8f40ce58be"
469 | },
470 | "source": [
471 | "num_sequences_test = len(x_test0)\n",
472 | "batch_size_test = 32\n",
473 | "batches_per_epoch_test = int(num_sequences_test/batch_size_test) + 1\n",
474 | "num_features= 768\n",
475 | "# evaluating on the test data\n",
476 | "model.evaluate_generator(test_generator(), steps= batches_per_epoch_test)"
477 | ],
478 | "execution_count": null,
479 | "outputs": [
480 | {
481 | "output_type": "execute_result",
482 | "data": {
483 | "text/plain": [
484 | "[0.548132598400116, 0.7691292762756348]"
485 | ]
486 | },
487 | "metadata": {
488 | "tags": []
489 | },
490 | "execution_count": 16
491 | }
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "metadata": {
497 | "id": "0TAUfLPHHBfl",
498 | "colab_type": "code",
499 | "colab": {}
500 | },
501 | "source": [
502 | "# defining a function which calculates various metrics such as micro and macro precision, accuracy and f1\n",
503 | "def metrics_calculator(preds, test_labels):\n",
504 | "    cm = confusion_matrix(test_labels, preds)\n",
505 | "    TP = []\n",
506 | "    FP = []\n",
507 | "    FN = []\n",
508 | "    for i in range(0,2):\n",
509 | "        summ = 0\n",
510 | "        for j in range(0,2):\n",
511 | "            if(i!=j):\n",
512 | "                summ=summ+cm[i][j]\n",
513 | "\n",
514 | "        FN.append(summ)\n",
515 | "    for i in range(0,2):\n",
516 | "        summ = 0\n",
517 | "        for j in range(0,2):\n",
518 | "            if(i!=j):\n",
519 | "                summ=summ+cm[j][i]\n",
520 | "\n",
521 | "        FP.append(summ)\n",
522 | "    for i in range(0,2):\n",
523 | "        TP.append(cm[i][i])\n",
524 | "    precision = []\n",
525 | "    recall = []\n",
526 | "    for i in range(0,2):\n",
527 | "        precision.append(TP[i]/(TP[i] + FP[i]))\n",
528 | "        
recall.append(TP[i]/(TP[i] + FN[i]))\n", 529 | "\n", 530 | " macro_precision = sum(precision)/2\n", 531 | " macro_recall = sum(recall)/2\n", 532 | " micro_precision = sum(TP)/(sum(TP) + sum(FP))\n", 533 | " micro_recall = sum(TP)/(sum(TP) + sum(FN))\n", 534 | " micro_f1 = (2*micro_precision*micro_recall)/(micro_precision + micro_recall)\n", 535 | " macro_f1 = (2*macro_precision*macro_recall)/(macro_precision + macro_recall)\n", 536 | " return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1" 537 | ], 538 | "execution_count": null, 539 | "outputs": [] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "metadata": { 544 | "colab_type": "code", 545 | "id": "aLiyuj2xmc64", 546 | "colab": { 547 | "base_uri": "https://localhost:8080/", 548 | "height": 34 549 | }, 550 | "outputId": "d6455f0a-e906-4ab9-eb01-cfcd5f699b18" 551 | }, 552 | "source": [ 553 | "# getting the predicted labels on the test data\n", 554 | "preds = model.predict_generator(test_generator(), steps= batches_per_epoch_test)\n", 555 | "y_pred = preds > 0.5\n", 556 | "\n", 557 | "# Calculating all metrics on test data predicted label\n", 558 | "print(metrics_calculator(y_pred, y_test0[:-1]))" 559 | ], 560 | "execution_count": null, 561 | "outputs": [ 562 | { 563 | "output_type": "stream", 564 | "text": [ 565 | "(0.7789586641591649, 0.7687549494826431, 0.7738231714660547, 0.7691292875989446, 0.7691292875989446, 0.7691292875989446)\n" 566 | ], 567 | "name": "stdout" 568 | } 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "metadata": { 574 | "colab_type": "code", 575 | "id": "AwHXW9Anmw2y", 576 | "colab": { 577 | "base_uri": "https://localhost:8080/", 578 | "height": 34 579 | }, 580 | "outputId": "d8d76501-0466-4ca4-aa82-272de0dcdb99" 581 | }, 582 | "source": [ 583 | "# getting the predicted labels on the dev data\n", 584 | "preds = model.predict_generator(val_generator(), steps= batches_per_epoch_val)\n", 585 | "y_pred_dev = preds > 0.5\n", 586 | "\n", 587 | "# Calculating all metrics on dev data predicted label\n", 588 | "print(metrics_calculator(y_pred_dev, y_dev0[:-2]))" 589 | ], 590 | "execution_count": null, 591 | "outputs": [ 592 | { 593 | "output_type": "stream", 594 | "text": [ 595 | "(0.7864010120177103, 0.7852822580645161, 0.7858412368659745, 0.7852822580645161, 0.7852822580645161, 0.7852822580645161)\n" 596 | ], 597 | "name": "stdout" 598 | } 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "metadata": { 604 | "colab_type": "code", 605 | "id": "9N9ZqI1rm1Gd", 606 | "colab": {} 607 | }, 608 | "source": [ 609 | "# saving the trained model\n", 610 | "model.save('CNN_XLNet.h5') # creates a HDF5 file 'CNN_XLNet.h5'" 611 | ], 612 | "execution_count": null, 613 | "outputs": [] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "metadata": { 618 | "colab_type": "code", 619 | "id": "zB5C5WGlrJ5i", 620 | "colab": {} 621 | }, 622 | "source": [ 623 | "" 624 | ], 625 | "execution_count": null, 626 | "outputs": [] 627 | } 628 | ] 629 | } -------------------------------------------------------------------------------- /Models/CNN/README.md: -------------------------------------------------------------------------------- 1 | ## CNN 2 | 3 | 4 | **Input** : 5 | **Transformers Chunk Embeddings**(eg. BERT, XLNet, RoBERTa.) with every chunk *Embedding Dim: 768*. For more details please refer to *CJPE/transformers/* folder. 
6 | 
7 | We pass the chunk embeddings through a 3-layer **Conv1D** network with a **MaxPool** layer after each Conv layer (for more details please refer to *CJPE/Models/CNN/CNN__final.ipynb*).
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 
-------------------------------------------------------------------------------- /Models/Classical Models/README.md: --------------------------------------------------------------------------------
1 | As mentioned in the paper, we have used two types of embeddings with the classical models: **Sent2Vec** and **Doc2Vec**. Both of these models were trained on the train set of **ILDCmulti**. We are not releasing the trained embedding models themselves, but we are releasing the mapped vectors for sentences (in the case of sent2vec) and for documents (in the case of doc2vec), as we believe these are more convenient to use. If you want to train your own models, you can use the train set of **ILDCmulti** to train the embeddings. We used the following codebases for reference:
2 | * For sent2vec. [(here)](https://ilmoirfan.com/how-to-train-sent2vec-model/)
3 | * For doc2vec. [(here)](https://radimrehurek.com/gensim/models/doc2vec.html)
4 | 
5 | ### Sent2Vec embeddings of ILDCmulti: ###
6 | 
7 | **Config**: mincount = 5, Vocab size = 750000, n-gram = 2, dimension size = 200. You can find the numpy files [here](https://drive.google.com/drive/folders/1d9TTu06NQzVL9WqZY3qNjKGt3w9PSUh3?usp=sharing)
8 | 
9 | ### Doc2Vec embeddings of ILDCmulti: ###
10 | 
11 | We are releasing two numpy files from two different versions of doc2vec
12 | 
13 | **Config**: dimension size = 500. You can find the numpy files [here](https://drive.google.com/drive/folders/1G0-8-j1br6aPa3E97HxXguvsfBiG2xUX?usp=sharing)
14 | 
15 | **Config**: dimension size = 1000. You can find the numpy files [here](https://drive.google.com/drive/folders/17qQe9t4BwD1VIjd2DGwpimC7rNlBioGH?usp=sharing)
16 | 
17 | ## For reproducing results: ##
18 | * Sent2Vec: You will need the sent2vec numpy files mentioned above. You can then run the following command:
19 | ```
20 | python classical_models_sent2vec_avgd.py path_to/train_set_npy_file path_to/test_set_npy_file path/to/dev_set_npy_file
21 | ```
22 | * Doc2Vec: You will need the Doc2Vec numpy files mentioned above (use any one configuration). You can then run the following command:
23 | ```
24 | python classical_models_doc2vec.py path_to/train_set_npy_file path_to_train_set_labels_file path_to/test_set_npy_file path_to/test_set_labels_file path_to/dev_set_npy_file path/to/dev_set_labels_file
25 | ```
26 | 
27 | For either embedding type, the output will be 3 files in the working directory: **SVM_results.txt, RF_results.txt, LR_results.txt**, with results for each hyperparameter setting (for sent2vec the SVM file is named **SVM_avgi_results.txt**). **Be careful when using the 1000-dimension Doc2Vec model with SVM; it can take a long time to generate results.**
28 | 
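29 | Before running either script, you can sanity-check the downloaded numpy files (a sketch; the file names here are placeholders for whichever files you downloaded):
30 | 
31 | ```
32 | import numpy as np
33 | 
34 | # doc2vec: one row per document; dimension 500 or 1000 depending on the config
35 | x_train = np.load("path_to/train_set_npy_file")
36 | y_train = np.load("path_to_train_set_labels_file")
37 | print(x_train.shape, y_train.shape)  # e.g. (32305, 500) and (32305,)
38 | ```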
-------------------------------------------------------------------------------- /Models/Classical Models/classical_models_doc2vec.py: --------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn import model_selection, naive_bayes, svm
6 | from sklearn.metrics import confusion_matrix
7 | from sklearn.metrics import accuracy_score
8 | 
9 | path_train = sys.argv[1]
10 | path_train_labels = sys.argv[2]
11 | path_test = sys.argv[3]
12 | path_test_labels = sys.argv[4]
13 | path_dev = sys.argv[5]
14 | path_dev_labels = sys.argv[6]
15 | 
16 | x_train=np.load(path_train)
17 | y_train=np.load(path_train_labels)
18 | x_test=np.load(path_test)
19 | y_test=np.load(path_test_labels)
20 | x_dev=np.load(path_dev)
21 | y_dev=np.load(path_dev_labels)
22 | 
23 | 
24 | # Utility function that calculates the metric scores given the predicted and true labels
25 | def metrics_calculator(preds, test_labels):
26 |     cm = confusion_matrix(test_labels, preds)
27 |     TP, FP, FN = [], [], []
28 |     for i in range(0,2):
29 |         summ = 0
30 |         for j in range(0,2):
31 |             if(i!=j):
32 |                 summ=summ+cm[i][j]
33 | 
34 |         FN.append(summ)
35 |     for i in range(0,2):
36 |         summ = 0
37 |         for j in range(0,2):
38 |             if(i!=j):
39 |                 summ=summ+cm[j][i]
40 | 
41 |         FP.append(summ)
42 |     for i in range(0,2):
43 |         TP.append(cm[i][i])
44 |     precision = []
45 |     recall = []
46 |     for i in range(0,2):
47 |         precision.append(TP[i]/(TP[i] + FP[i]))
48 |         recall.append(TP[i]/(TP[i] + FN[i]))
49 | 
50 |     macro_precision = sum(precision)/2
51 |     macro_recall = sum(recall)/2
52 |     micro_precision = sum(TP)/(sum(TP) + sum(FP))
53 |     micro_recall = sum(TP)/(sum(TP) + sum(FN))
54 |     micro_f1 = (2*micro_precision*micro_recall)/(micro_precision + micro_recall)
55 |     macro_f1 = (2*macro_precision*macro_recall)/(macro_precision + macro_recall)
56 |     return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1
57 | 
58 | # RF utility function that creates the file RF_results.txt, in which we vary the n_estimators parameter from 50 to 500
59 | # with increments of 50
60 | def RF_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels):
61 |     f = open("RF_results.txt", "w+")
62 |     f.write("Varying the n_estimators from 50 to 500\n\n")
63 |     for n_est in range(50,500,50):
64 |         clf=RandomForestClassifier(n_estimators=n_est)
65 |         clf.fit(train_avg,train_labels)
66 |         d_preds = clf.predict(dev_avg)
67 |         Heading = "For n_estimators: " + str(n_est) + "\n"
68 |         d_res = "Accuracy -> " + str(accuracy_score(d_preds, dev_labels)*100) + "\n"
69 |         macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(d_preds, dev_labels)
70 |         d_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n"
71 |         f.write(Heading + "Dev set:\n"+ d_res + d_metrics)
72 | 
73 |         t_preds = clf.predict(test_avg)
74 |         t_res = "Accuracy -> " + str(accuracy_score(t_preds, test_labels)*100) + "\n"
75 |         macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(t_preds, test_labels)
76 |         t_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " 
+ str(macro_f1) + "\n"
77 |         f.write("Test set:\n"+ t_res + t_metrics + "\n\n")
78 | 
79 |     f.close()
80 | 
81 | # LR utility function that creates the file LR_results.txt, in which we vary the max_iters parameter from 50 to 500
82 | # with increments of 50
83 | def LR_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels):
84 |     f = open("LR_results.txt", "w+")
85 |     f.write("Varying the max_iters from 50 to 500\n\n")
86 |     for it in range(50,500,50):
87 |         LR = LogisticRegression(C=1, max_iter =it)
88 |         LR.fit(train_avg, train_labels)
89 |         d_preds = LR.predict(dev_avg)
90 |         Heading = "For max_iters: " + str(it) + "\n"
91 |         d_res = "Accuracy -> " + str(accuracy_score(d_preds, dev_labels)*100) + "\n"
92 |         macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(d_preds, dev_labels)
93 |         d_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n"
94 |         f.write(Heading + "Dev set:\n"+ d_res + d_metrics)
95 | 
96 |         t_preds = LR.predict(test_avg)
97 |         t_res = "Accuracy -> " + str(accuracy_score(t_preds, test_labels)*100) + "\n"
98 |         macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(t_preds, test_labels)
99 |         t_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n"
100 |         f.write("Test set:\n"+ t_res + t_metrics + "\n\n")
101 | 
102 |     f.close()
103 | 
104 | 
105 | 
106 | # SVM utility function that gives results by creating the file "SVM_results.txt" in the working
107 | # directory
108 | # Remember that in the hyperparams we are only varying the kernels here, from linear to poly to rbf
109 | def SVM_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels):
110 |     f = open("SVM_results.txt", "w+")
111 |     f.write("Varying the kernels: \n\n")
112 |     kers = ["linear", "poly", "rbf"]
113 |     for k in kers:
114 |         print("Running for {0}".format(k))
115 |         SVM = svm.SVC(C=1, kernel=k)
116 |         SVM.fit(train_avg, train_labels)
117 |         d_preds = SVM.predict(dev_avg)
118 |         Heading = "For kernel: " + k + "\n"
119 |         d_res = "Accuracy -> " + str(accuracy_score(d_preds, dev_labels)*100) + "\n"
120 |         macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(d_preds, dev_labels)
121 |         d_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n"
122 |         f.write(Heading + "Dev set:\n"+ d_res + d_metrics)
123 | 
124 |         t_preds = SVM.predict(test_avg)
125 |         t_res = "Accuracy -> " + str(accuracy_score(t_preds, test_labels)*100) + "\n"
126 |         macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(t_preds, test_labels)
127 |         t_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n"
128 |         f.write("Test set:\n"+ t_res + t_metrics + "\n\n")
129 |     f.close()
130 | 
131 | 
132 | # Train and get the results for each model (fixed: the labels loaded above are y_train/y_dev/y_test)
133 | LR_scores(x_train,x_dev,x_test,y_train,y_dev,y_test)
134 | RF_scores(x_train,x_dev,x_test,y_train,y_dev,y_test)
135 | SVM_scores(x_train,x_dev,x_test,y_train,y_dev,y_test)
-------------------------------------------------------------------------------- /Models/Classical Models/classical_models_sent2vec_avgd.py: --------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn import model_selection, naive_bayes, svm
6 | from sklearn.metrics import confusion_matrix
7 | from sklearn.metrics import accuracy_score
8 | 
9 | path_x = sys.argv[1] # train set
10 | path_y = sys.argv[2] # test set
11 | path_z = sys.argv[3] # dev set
12 | 
13 | # Here we are averaging out all the sentence embeddings of a document.
14 | # We also tried max and min pooling, but they gave worse results.
15 | # Modify the code here accordingly if you want to try out those methods as well.
16 | 
17 | def average_out_embeddings(npy_file_path):
18 |     x=np.load(npy_file_path,allow_pickle='true')
19 |     x_split_data=x[:,0]
20 |     y_split=x[:,1]
21 |     y_split=y_split.astype('int')
22 |     x_split=np.zeros([x.shape[0],200]) # one 200-d vector per document (sized from the input, so this works for train/dev/test alike)
23 |     for i in range(0,x.shape[0]):
24 |         b=np.zeros([len(x_split_data[i]),200])
25 |         for j in range(0,len(x_split_data[i])):
26 |             b[j,:]=x_split_data[i][j]
27 |         b=np.sum(b,axis=0)
28 |         x_split[i,:]=b/len(x_split_data[i])
29 |     return x_split, y_split
30 | 
31 | #prepare data for each set by averaging out sentence embeddings
32 | x_train, y_train = average_out_embeddings(path_x)
33 | x_test, y_test = average_out_embeddings(path_y)
34 | x_dev, y_dev = average_out_embeddings(path_z)
35 | 
36 | 
37 | #prepare true labels for training
38 | y_train1=np.zeros([32305,1])
39 | for i in range(0,32305):
40 |     y_train1[i][0]=y_train[i]
41 | y_test1=np.zeros([1517,1])
42 | for i in range(0,1517):
43 |     y_test1[i][0]=y_test[i]
44 | y_dev1=np.zeros([994,1])
45 | for i in range(0,994):
46 |     y_dev1[i][0]=y_dev[i]
47 | 
48 | 
49 | # Utility function that calculates the metric scores given the predicted and true labels
50 | def metrics_calculator(preds, test_labels):
51 |     cm = confusion_matrix(test_labels, preds)
52 |     TP, FP, FN = [], [], []
53 |     for i in range(0,2):
54 |         summ = 0
55 |         for j in range(0,2):
56 |             if(i!=j):
57 |                 summ=summ+cm[i][j]
58 | 
59 |         FN.append(summ)
60 |     for i in range(0,2):
61 |         summ = 0
62 |         for j in range(0,2):
63 |             if(i!=j):
64 |                 summ=summ+cm[j][i]
65 | 
66 |         FP.append(summ)
67 |     for i in range(0,2):
68 |         TP.append(cm[i][i])
69 |     precision = []
70 |     recall = []
71 |     for i in range(0,2):
72 |         precision.append(TP[i]/(TP[i] + FP[i]))
73 |         recall.append(TP[i]/(TP[i] + FN[i]))
74 | 
75 |     macro_precision = sum(precision)/2
76 |     macro_recall = sum(recall)/2
77 |     micro_precision = sum(TP)/(sum(TP) + sum(FP))
78 |     micro_recall = sum(TP)/(sum(TP) + sum(FN))
79 |     micro_f1 = (2*micro_precision*micro_recall)/(micro_precision + micro_recall)
80 |     macro_f1 = (2*macro_precision*macro_recall)/(macro_precision + macro_recall)
81 |     return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1
82 | 
83 | 
84 | # SVM utility function that gives results by creating the file "SVM_avgi_results.txt" in the working
85 | # directory
86 | # Remember that in the hyperparams we are only varying the kernels here, from linear to poly to rbf
87 | def SVM_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels):
88 |     f = open("SVM_avgi_results.txt", "w+")
89 |     f.write("Varying the kernels: \n\n")
90 |     kers = ["linear", "poly", "rbf"]
91 |     for k in kers:
92 |         print("Running for {0}".format(k))
93 |         SVM = 
svm.SVC(C=1, kernel=k) 94 | SVM.fit(train_avg, train_labels) 95 | d_preds = SVM.predict(dev_avg) 96 | Heading = "For kernel: " + k + "\n" 97 | d_res = "Accuracy -> " + str(accuracy_score(d_preds, dev_labels)*100) + "\n" 98 | macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(d_preds, dev_labels) 99 | d_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n" 100 | f.write(Heading + "Dev set:\n"+ d_res + d_metrics) 101 | 102 | t_preds = SVM.predict(test_avg) 103 | t_res = "Accuracy -> " + str(accuracy_score(t_preds, test_labels)*100) + "\n" 104 | macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(t_preds, test_labels) 105 | t_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n" 106 | f.write("Test set:\n"+ t_res + t_metrics + "\n\n") 107 | f.close() 108 | 109 | # RF utility function that creates the file RF_results.txt in which we vary the n_estimators parameter from 50 to 500 110 | # with increments of 50 111 | def RF_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels): 112 | f = open("RF_results.txt", "w+") 113 | f.write("Varying the n_estimators from 50 to 500\n\n") 114 | for n_est in range(50,500,50): 115 | clf=RandomForestClassifier(n_estimators=n_est) 116 | clf.fit(train_avg,train_labels) 117 | d_preds = clf.predict(dev_avg) 118 | Heading = "For n_estimators: " + str(n_est) + "\n" 119 | d_res = "Accuracy -> " + str(accuracy_score(d_preds, dev_labels)*100) + "\n" 120 | macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(d_preds, dev_labels) 121 | d_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n" 122 | f.write(Heading + "Dev set:\n"+ d_res + d_metrics) 123 | 124 | t_preds = clf.predict(test_avg) 125 | t_res = "Accuracy -> " + str(accuracy_score(t_preds, test_labels)*100) + "\n" 126 | macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(t_preds, test_labels) 127 | t_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n" 128 | f.write("Test set:\n"+ t_res + t_metrics + "\n\n") 129 | 130 | f.close() 131 | 132 | # LR utility function that creates the file RF_results.txt in which we vary the max_iters parameter from 50 to 500 133 | # with increments of 50 134 | def LR_scores(train_avg, dev_avg, test_avg, train_labels, dev_labels, test_labels): 135 | f = open("LR_results.txt", "w+") 136 | f.write("Varying the max_iters from 50 to 500\n\n") 137 | for it in range(50,500,50): 138 | LR = LogisticRegression(C=1, max_iter =it) 139 | LR.fit(train_avg, train_labels) 140 | d_preds = LR.predict(dev_avg) 141 | Heading = "For max_iters: " + str(it) + "\n" 142 | d_res = "Accuracy -> " + str(accuracy_score(d_preds, dev_labels)*100) + "\n" 143 | macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(d_preds, dev_labels) 144 | d_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " 
Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n" 145 | f.write(Heading + "Dev set:\n"+ d_res + d_metrics) 146 | 147 | t_preds = LR.predict(test_avg) 148 | t_res = "Accuracy -> " + str(accuracy_score(t_preds, test_labels)*100) + "\n" 149 | macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = metrics_calculator(t_preds, test_labels) 150 | t_metrics = "Micros : " + str(micro_precision) + " " + str(micro_recall) + " " + str(micro_f1) + " Macros: " + str(macro_precision) + " " + str(macro_recall) + " " + str(macro_f1) + "\n" 151 | f.write("Test set:\n"+ t_res + t_metrics + "\n\n") 152 | 153 | f.close() 154 | 155 | # Train and get the results for each model 156 | LR_scores(x_train,x_dev,x_test,y_train1,y_dev1,y_test1) 157 | RF_scores(x_train,x_dev,x_test,y_train1,y_dev1,y_test1) 158 | SVM_scores(x_train,x_dev,x_test,y_train1,y_dev1,y_test1) -------------------------------------------------------------------------------- /Models/Explanations/README.md: -------------------------------------------------------------------------------- 1 | # File Descriptions # 2 | 3 | **File name**| **Description** 4 | -------------|---------------- 5 | **occ_explanations_hierarchical.ipynb:**| This file was used for the BiGRU half occlusion (refer to paper). We trained the BiGRU model without attention and used the defined occlusion method to give wieghts to each chunk in every document of **ILDCexpert**. 6 | **XLNet_noatt_2layer_occlusion:**| This file was used for XLNet half occlusion (refer to paper). We used the occlusion weights generated from the above file to rank and extract sentences from each chunk having a positive chunk score. 7 | **metrics and results**| The results of machine explanations and annotated explanations' json files are stored in this folder. This also has scripts for each metric and results. 8 | 9 | To make reproducibility easier, we have also released occlusion weights of each document after the BiGRU step [here](https://drive.google.com/drive/folders/1g4di6WHAnKPoUl8gQ8YcQUOILBmmy1q7?usp=sharing). You can plot these yourself if desired. 10 | 11 | 12 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/README.md: -------------------------------------------------------------------------------- 1 | # File Description 2 | 3 | **File Name**| **Description** 4 | ---------|------------ 5 | **occ_explanations.json**| This json file contains the explanatory sentences collected using the algrithm in the paper. You can access the explanations by using the the file name of any file in **ILDCexper** 6 | **gold_explanations.json**| This json file contains the explanatory sentences collected from the users' annotation. To access the annotations you need to give the file name of any file in **ILDCexpert**, then the user number like this "User 2" (number should be between 1 to 5) and then finally "exp" if you want the explanation or "verdict" if you want to access the verdict of judgment given by the user (Note that this is not the true label). 7 | **gold_explanations_ranked.json**| This json file contains the explanatory sentences collected from the users' annotation. The difference between this file and above file is that in this file you can access the sentences belonging to a particular rank as well (Rank 1 to 10). 
-------------------------------------------------------------------------------- /Models/Explanations/metrics and results/README.md: --------------------------------------------------------------------------------
1 | # File Description
2 | 
3 | **File Name**| **Description**
4 | ---------|------------
5 | **occ_explanations.json**| This json file contains the explanatory sentences collected using the algorithm in the paper. You can access the explanations by using the file name of any file in **ILDCexpert**.
6 | **gold_explanations.json**| This json file contains the explanatory sentences collected from the users' annotations. To access the annotations you need to give the file name of any file in **ILDCexpert**, then the user number like "User 2" (the number should be between 1 and 5), and then finally "exp" if you want the explanation or "verdict" if you want the verdict of the judgment given by that user (note that this is not the true label).
7 | **gold_explanations_ranked.json**| This json file contains the explanatory sentences collected from the users' annotations. The difference from the file above is that here you can also access the sentences belonging to a particular rank (Rank 1 to 10). So, to access the explanations you need to give the file name of any file in **ILDCexpert**, then the user number like "User 2" (between 1 and 5), then the rank as "Rank4", and finally "exp" for the explanation or "verdict" for the verdict of the judgment given by that user (see the sketch after this table).
8 | **anno_explanations_scores.xlsx**| This is an xlsx file with 56 sheets, one for each file in **ILDCexpert**; using it one can look up the scores of the metrics (ROUGE-1, ROUGE-2, ROUGE-L, Jaccard Similarity, Overlap-min, Overlap-max, METEOR and BLEU) for every single file. Each sheet also contains the "all ranks combined" explanations and the verdict.
9 | **xl_anno_make.py**| This is the script for generating anno_explanations_scores.xlsx
10 | **metricmaker.py**| To reproduce the machine vs. user explanation results, use this file. You will need the files **gold_explanations_ranked.json** and **occ_explanations.json** to generate the results. You can adjust the code to choose which collections of ranks (e.g. Ranks 1 to 5) you want to compare the machine explanations with. Currently it is set to generate the combinations: Ranks 1 to 10, Rank 1 only, Rank 2 only, Rank 3 only, Rank 4 only, Rank 5 only, Rank 6 only, Rank 7 only, Rank 8 only, Rank 9 only, Rank 10 only, Ranks 1 to 5, and Ranks 5 to 10. The result files are written to the folder result_files.
11 | **result_files**| This folder contains the results generated by metricmaker.py
12 | **among users scores**| This folder contains the scores, averaged across all 56 files, of each metric type (ROUGE-1, ROUGE-2, ROUGE-L, Jaccard Similarity, Overlap-min, Overlap-max, METEOR and BLEU) for every user from 1 to 5
13 | 
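14 | A minimal sketch of the access patterns described above (the document name is a placeholder; the keys follow the descriptions in this table):
15 | 
16 | ```
17 | import json
18 | 
19 | with open("occ_explanations.json") as f:
20 |     occ_exp = json.load(f)
21 | with open("gold_explanations_ranked.json") as f:
22 |     gold_exp = json.load(f)
23 | 
24 | doc = list(occ_exp.keys())[0]  # any ILDCexpert file name
25 | machine_exp = occ_exp[doc]                          # machine explanation
26 | user_exp = gold_exp[doc]["User 2"]["Rank4"]["exp"]  # user 2's Rank-4 explanation
27 | user_verdict = gold_exp[doc]["User 2"]["Rank4"]["verdict"]  # user's verdict (not the true label)
28 | ```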
-------------------------------------------------------------------------------- /Models/Explanations/metrics and results/among users scores/README.md: --------------------------------------------------------------------------------
1 | # User vs. User scores
2 | 
3 | **Note**: You will need the **anno_explanations_scores.xlsx** file (in the parent directory of this one) to create the averaged results.
4 | 
5 | The **uservsusermetric.ipynb** notebook creates the file **resultsamongusers.xlsx**, which contains 8 sheets, one for each of the eight metrics defined in the paper: ROUGE-1, ROUGE-2, ROUGE-L, Jaccard Similarity, BLEU, METEOR, Overlap-min, Overlap-max.
6 | 
7 | The sheets in the file **resultsamongusers.xlsx** are in the following order:
8 | 
9 | **Sheet number**|**Metric**
10 | ----------------|----------
11 | 1| Jaccard Similarity
12 | 2| Overlap Min
13 | 3| Overlap Max
14 | 4| ROUGE-1
15 | 5| ROUGE-2
16 | 6| ROUGE-L
17 | 7| BLEU
18 | 8| METEOR
19 | 20 | 
-------------------------------------------------------------------------------- /Models/Explanations/metrics and results/among users scores/resultsamongusers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Models/Explanations/metrics and results/among users scores/resultsamongusers.xlsx
-------------------------------------------------------------------------------- /Models/Explanations/metrics and results/among users scores/uservsusermetric.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "df = []\n",
11 | "for i in range(56):\n",
12 | "    df.append(pd.read_excel(\"anno_explanations_scores.xlsx\", \"Sheet\" + str(i+1)))"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 10,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "['User 1vs User 2 = 0.3788', 'User 1vs User 3 = 0.6015', 'User 1vs User 4 = 0.5862', 'User 1vs User 5 = 0.4524', 'User 2vs User 1 = 0.3788', 'User 2vs User 3 = 0.2668', 'User 2vs User 4 = 0.2637', 'User 2vs User 5 = 0.3401', 'User 3vs User 1 = 0.6015', 'User 3vs User 2 = 0.2668', 'User 3vs User 4 = 0.896', 'User 3vs User 5 = 0.3973', 'User 4vs User 1 = 0.5862', 'User 4vs User 2 = 0.2637', 'User 4vs User 3 = 0.896', 'User 4vs User 5 = 0.4003', 'User 5vs User 1 = 0.4524', 'User 5vs User 2 = 0.3401', 'User 5vs User 3 = 0.3973', 'User 5vs User 4 = 0.4003', '']\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "metrics = [\"Jaccard\", \"OverlapMin\", \"OverlapMax\", \"ROUGE-1\", \"ROUGE-2\", \"ROUGE-L\", \"BLEU\", \"METEOR\"]\n",
30 | "text = df[0][metrics[0]].iloc[0]\n",
31 | "text = text.replace(\"\\n\\n\",\"\\n\")\n",
32 | "scores = 
text.split(\"\\n\")\n", 71 | " sc_matrix = [[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0]]\n", 72 | " for i in range(20):\n", 73 | " sc_matrix[int(scores[i][5])-1][int(scores[i][14])-1] = float(scores[i][18:])\n", 74 | " \n", 75 | " adding_matrix = add_matrix(adding_matrix, sc_matrix)\n", 76 | " \n", 77 | " adding_matrix = div_matrix(adding_matrix, 56)\n", 78 | " \n", 79 | " sheet = book.add_worksheet()\n", 80 | " row=0\n", 81 | " column=0\n", 82 | " for ii,row_score in enumerate(adding_matrix):\n", 83 | " for jj,score in enumerate(row_score):\n", 84 | " sheet.write(row,column,adding_matrix[ii][jj])\n", 85 | " column+=1\n", 86 | " \n", 87 | " row+=1\n", 88 | " column=0\n", 89 | " " 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 31, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "book.close()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.7.6" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/anno_explanations_scores.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Models/Explanations/metrics and results/anno_explanations_scores.xlsx -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/metricmaker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[72]: 5 | 6 | 7 | import json 8 | g_json = open("gold_explanations_ranked.json", "r") 9 | gold_exp = json.load(g_json) 10 | o_json = open("occ_explanations.json", "r") 11 | occ_exp = json.load(o_json) 12 | 13 | 14 | # In[73]: 15 | 16 | 17 | files = list(occ_exp.keys()) 18 | 19 | 20 | # In[76]: 21 | 22 | 23 | import nltk 24 | from nltk.stem import PorterStemmer 25 | from nltk.tokenize import word_tokenize 26 | from rouge import Rouge 27 | import nltk.translate 28 | from nltk.translate import meteor_score 29 | import progressbar 30 | import numpy as np 31 | 32 | # In[77]: 33 | 34 | 35 | def get_BLEU_score(ref_text, machine_text): 36 | tok_ref_text = word_tokenize(ref_text) 37 | tok_machine_text = word_tokenize(machine_text) 38 | sc = nltk.translate.bleu_score.sentence_bleu([tok_ref_text], tok_machine_text, weights = (0.5,0.5)) 39 | return sc 40 | 41 | def jaccard_similarity(query, document): 42 | query = word_tokenize(query) 43 | document = word_tokenize(document) 44 | intersection = set(query).intersection(set(document)) 45 | union = set(query).union(set(document)) 46 | if(len(union)==0): 47 | return 0 48 | return len(intersection)/len(union) 49 | 50 | def overlap_coefficient_min(query, document): 51 | query = word_tokenize(query) 52 | document = word_tokenize(document) 53 | intersection = set(query).intersection(set(document)) 
54 | den = min(len(set(query)),len(set(document))) 55 | if(den==0): 56 | return 0 57 | return len(intersection)/den 58 | 59 | def overlap_coefficient_max(query, document): 60 | query = word_tokenize(query) 61 | document = word_tokenize(document) 62 | intersection = set(query).intersection(set(document)) 63 | den = max(len(set(query)),len(set(document))) 64 | if(den==0): 65 | return 0 66 | return len(intersection)/den 67 | 68 | 69 | # In[86]: 70 | 71 | 72 | def occ_result_maker(file_to_write, Rank_initial, Rank_final, occ_exp, gold_exp): 73 | rouge1 = [] 74 | rouge2 = [] 75 | rougel = [] 76 | jaccard = [] 77 | bleu = [] 78 | meteor = [] 79 | overlap_min = [] 80 | overlap_max = [] 81 | 82 | for u in range(5): 83 | user = "User " + str(u+1) 84 | r1 = [] 85 | r2 = [] 86 | rl = [] 87 | jacc = [] 88 | bl = [] 89 | met = [] 90 | omin = [] 91 | omax = [] 92 | 93 | for i in progressbar.progressbar(range(len(files))): 94 | f = files[i] 95 | ref_text = "" 96 | for rank in range(Rank_initial, Rank_final+1, 1): 97 | if(gold_exp[f][user]["exp"]["Rank" + str(rank)]!=""): 98 | ref_text += gold_exp[f][user]["exp"]["Rank" + str(rank)] + " " 99 | 100 | machine_text = occ_exp[f] 101 | machine_text = machine_text.lower() 102 | ref_text = ref_text.lower() 103 | 104 | if(ref_text == ""): 105 | continue 106 | rouge = Rouge() 107 | score = rouge.get_scores(machine_text, ref_text) 108 | r1.append(score[0]['rouge-1']['f']) 109 | r2.append(score[0]['rouge-2']['f']) 110 | rl.append(score[0]['rouge-l']['f']) 111 | jacc.append(jaccard_similarity(ref_text, machine_text)) 112 | omin.append(overlap_coefficient_min(ref_text, machine_text)) 113 | omax.append(overlap_coefficient_max(ref_text, machine_text)) 114 | bl.append(get_BLEU_score(ref_text, machine_text)) 115 | met.append(nltk.translate.meteor_score.meteor_score([ref_text], machine_text)) 116 | 117 | rouge1.append(np.mean(r1)) 118 | rouge2.append(np.mean(r2)) 119 | rougel.append(np.mean(rl)) 120 | jaccard.append(np.mean(jacc)) 121 | overlap_min.append(np.mean(omin)) 122 | overlap_max.append(np.mean(omax)) 123 | bleu.append(np.mean(bl)) 124 | meteor.append(np.mean(met)) 125 | 126 | file_to_write.write("ROUGE-1 : {:}".format(rouge1) + "\n\n") 127 | file_to_write.write("ROUGE-2 : {:}".format(rouge2) + "\n\n") 128 | file_to_write.write("ROUGE-L : {:}".format(rougel)+ "\n\n") 129 | file_to_write.write("Jaccard : {:}".format(jaccard)+ "\n\n") 130 | file_to_write.write("Overmin : {:}".format(overlap_min)+ "\n\n") 131 | file_to_write.write("Overmax : {:}".format(overlap_max)+ "\n\n") 132 | file_to_write.write("BLEU : {:}".format(bleu)+ "\n\n") 133 | file_to_write.write("METEOR : {:}".format(meteor)+ "\n\n") 134 | 135 | 136 | 137 | 138 | experiments = [(1,10), (1,1), (2,2), (3,3,), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9), (10,10), (1,5), (5,10)] 139 | for exp in experiments: 140 | print(exp) 141 | f = open("result_files/Rank_" + str(exp[0]) + "_to_" + str(exp[1]) + ".txt", "w") 142 | occ_result_maker(f, exp[0], exp[1], occ_exp, gold_exp) 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_10_to_10.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [nan, nan, nan, 0.03445149517649651, 0.15708265876264213] 2 | 3 | ROUGE-2 : [nan, nan, nan, 0.01270033188918801, 0.09166365329030071] 4 | 5 | ROUGE-L : [nan, nan, nan, 0.0380434765616509, 0.18298345049351192] 6 | 7 | Jaccard : [nan, nan, nan, 0.02364394993045897, 
0.12035711145444725] 8 | 9 | Overmin : [nan, nan, nan, 0.2698412698412698, 0.5815615835777126] 10 | 11 | Overmax : [nan, nan, nan, 0.02526002971768202, 0.1324182763744428] 12 | 13 | BLEU : [nan, nan, nan, 0.01132414506183412, 0.06985846019165107] 14 | 15 | METEOR : [nan, nan, nan, 0.06413246268656717, 0.24679374774483653] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_1_to_1.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.2994079837753364, 0.3401171990266999, 0.5020293607999134, 0.2599273220300319, 0.23972320202605527] 2 | 3 | ROUGE-2 : [0.15350285289780233, 0.1840629854201761, 0.2967417120072472, 0.12687477876614914, 0.11715280563775633] 4 | 5 | ROUGE-L : [0.26560174635742684, 0.3011282822401636, 0.4013145705101809, 0.22608537796908681, 0.22921167817285454] 6 | 7 | Jaccard : [0.19983252368077029, 0.22155909610269411, 0.3173257732839313, 0.16971944138117856, 0.16556272801452307] 8 | 9 | Overmin : [0.5878965417859003, 0.5906412802110796, 0.6092618617419718, 0.5429151955022612, 0.5886222288156457] 10 | 11 | Overmax : [0.24577479849568715, 0.274019322961028, 0.4084659055515817, 0.20796097635613348, 0.1959603194302247] 12 | 13 | BLEU : [0.14950455026551707, 0.17856431048405882, 0.2657681947747096, 0.12603258128679604, 0.10658159841008764] 14 | 15 | METEOR : [0.30630770837600824, 0.35012497438909423, 0.3133660268613169, 0.27237982309035663, 0.2928280855710654] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_1_to_10.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.4440211066200134, 0.5167341091606427, 0.40141773671867104, 0.39072717646796995, 0.5008305754567868] 2 | 3 | ROUGE-2 : [0.30326689584374666, 0.29529259617415965, 0.2964563966491299, 0.29648503190860837, 0.2937383790565076] 4 | 5 | ROUGE-L : [0.43916912171050015, 0.4072227672357733, 0.4230512791342282, 0.44447383801471363, 0.40721621958398263] 6 | 7 | Jaccard : [0.33268960776585504, 0.31695936860656587, 0.3278951049648254, 0.324123947111877, 0.31766649785263473] 8 | 9 | Overmin : [0.7441138283611931, 0.5887096910494345, 0.8092717292233901, 0.8336820167079294, 0.6168974217094984] 10 | 11 | Overmax : [0.38971886120207105, 0.41424963689706434, 0.35998970731533503, 0.35106813441965556, 0.40056833299703304] 12 | 13 | BLEU : [0.1597417204008955, 0.27948665292835273, 0.09921720009652026, 0.09308330395926168, 0.24830353240377137] 14 | 15 | METEOR : [0.2197752519781969, 0.30127181844463763, 0.1796085456542766, 0.17676207018861811, 0.2790961207989849] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_1_to_5.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.4479013483389395, 0.516775364103722, 0.40141773671867104, 0.3929503887128605, 0.5065848876691795] 2 | 3 | ROUGE-2 : [0.30441088582251524, 0.2952063079756876, 0.2964563966491299, 0.2971917904269761, 0.2860316994679077] 4 | 5 | ROUGE-L : [0.4396704330615302, 0.40695559931474407, 0.4230512791342282, 0.44388894556073727, 0.4005075169456302] 6 | 7 | Jaccard : [0.33332999549989595, 0.31676517568246193, 0.3278951049648254, 0.3238231125560341, 0.3128300157255223] 8 | 9 | Overmin : [0.7419692205180558, 0.5882055518008674, 0.8092717292233901, 
0.8303338217840877, 0.5900441306717565] 10 | 11 | Overmax : [0.39173890124004845, 0.41416686010833753, 0.35998970731533503, 0.35170060736959247, 0.40593116212255204] 12 | 13 | BLEU : [0.16357030874812212, 0.2795990362779476, 0.09921720009652026, 0.09567927088943153, 0.26851394868872575] 14 | 15 | METEOR : [0.2223257454763319, 0.3013508686100064, 0.1796085456542766, 0.17872939982273697, 0.29558919515976256] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_2_to_2.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.42672806650987516, 0.36154245709942606, 0.5124578892430357, 0.42484186037615423, 0.24354299669127596] 2 | 3 | ROUGE-2 : [0.23209451862819883, 0.16920653510163125, 0.2667654142102132, 0.2305130479433321, 0.1230341735755373] 4 | 5 | ROUGE-L : [0.35297598850932677, 0.2859474741964669, 0.36486592187139216, 0.34744047620943147, 0.23155218422158266] 6 | 7 | Jaccard : [0.2697587729138904, 0.21739099215682162, 0.30211979836683905, 0.2653518310659205, 0.16605860741062736] 8 | 9 | Overmin : [0.5776385536853391, 0.5275396165248144, 0.5408687024524034, 0.563396916621113, 0.5517557331512923] 10 | 11 | Overmax : [0.343023609333886, 0.27891723979886235, 0.4114077999365115, 0.3468569088500664, 0.19778024126938404] 12 | 13 | BLEU : [0.2334339040558118, 0.18366046819705104, 0.2918194387307995, 0.23763557130667218, 0.11622686120113397] 14 | 15 | METEOR : [0.34802333317189943, 0.3247678686348737, 0.3111982819781351, 0.35003298478918365, 0.2923932254054842] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_3_to_3.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.42848138279369336, 0.3432733765215266, 0.3530260590360802, 0.49771932788571993, 0.3379052757863695] 2 | 3 | ROUGE-2 : [0.2228252418458156, 0.1667461545050475, 0.16754775500841212, 0.27829604051979057, 0.1730344599879051] 4 | 5 | ROUGE-L : [0.33123255509193295, 0.2807312738411141, 0.28160119860040916, 0.3977056334642953, 0.2920550704095285] 6 | 7 | Jaccard : [0.25446958533623754, 0.2153784488122098, 0.21178656350524147, 0.3039241216636598, 0.2217792593309701] 8 | 9 | Overmin : [0.5449614529190443, 0.5411213807250572, 0.5066233705135222, 0.5985090920461333, 0.551339111830478] 10 | 11 | Overmax : [0.3309373293133875, 0.2717249252056011, 0.27732551412918066, 0.391318262842981, 0.2790361100394604] 12 | 13 | BLEU : [0.23209562075464396, 0.17722757450271093, 0.18048203363857382, 0.2547255097379439, 0.17943275331920247] 14 | 15 | METEOR : [0.3203243200816375, 0.31773919165735043, 0.2909715961677332, 0.29891989566961064, 0.3414080671011608] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_4_to_4.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.39494232184572337, 0.24891267425329447, 0.1843455535257677, 0.4688696167844332, 0.3231705049876493] 2 | 3 | ROUGE-2 : [0.207736821841145, 0.13805718910969328, 0.06999833245875614, 0.2469625847755636, 0.15591103614648769] 4 | 5 | ROUGE-L : [0.3278987479581329, 0.25772692669920183, 0.1646492836327949, 0.3636429301711796, 0.28071481644767643] 6 | 7 | Jaccard : [0.2514099036594123, 0.18494549017401662, 0.12098317178596542, 0.28052770947783057, 0.21091506502730384] 8 
| 9 | Overmin : [0.5916271806154155, 0.6610049974248723, 0.4914470135877522, 0.5425876926196053, 0.5481415698934565] 10 | 11 | Overmax : [0.3129158121245157, 0.2108385673384157, 0.14691691421018432, 0.37167364667836394, 0.2675196548315725] 12 | 13 | BLEU : [0.2021776481233977, 0.11907823966097168, 0.07672718029817406, 0.2593109633676457, 0.16170642449649011] 14 | 15 | METEOR : [0.312623555145515, 0.3017824408740791, 0.2094894492051845, 0.3267482997502233, 0.32501523795763504] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_5_to_10.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.44965353866239877, 0.16070177125720794, 0.029876226560277358, 0.2870115106853082, 0.347969492065285] 2 | 3 | ROUGE-2 : [0.2304566997685214, 0.08498214346493514, 0.006834685540256092, 0.1370448466565096, 0.1604634124178717] 4 | 5 | ROUGE-L : [0.3428992209500984, 0.1722111025911535, 0.05775075863046353, 0.2469837908704085, 0.2689067512787212] 6 | 7 | Jaccard : [0.2644130401762261, 0.11825573079468554, 0.03930817610062893, 0.18475563485055974, 0.2079707839749613] 8 | 9 | Overmin : [0.5327555755853299, 0.660773974698524, 0.5434782608695652, 0.4985367807362627, 0.5165732532774278] 10 | 11 | Overmax : [0.3518345759414593, 0.13071156064746217, 0.04065040650406504, 0.2354515363006275, 0.2720194936370621] 12 | 13 | BLEU : [0.19696412427904703, 0.07082638377008542, 0.008102187273874817, 0.14174947035851404, 0.1791364338043909] 14 | 15 | METEOR : [0.2614177309896075, 0.22442814333980285, 0.07957693645411881, 0.2648542849262025, 0.30186284785226297] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_5_to_5.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.4525687687103372, 0.1542590438804973, 0.029876226560277358, 0.27530643271659777, 0.24800884174973617] 2 | 3 | ROUGE-2 : [0.22648093791021356, 0.08140023201937187, 0.006834685540256092, 0.12967200468229742, 0.10656000291589704] 4 | 5 | ROUGE-L : [0.3350354446913021, 0.16531169205336582, 0.05775075863046353, 0.2407498188280551, 0.21705234559305042] 6 | 7 | Jaccard : [0.2580381969022034, 0.11203275636853825, 0.03930817610062893, 0.1780107070306969, 0.1640858997641607] 8 | 9 | Overmin : [0.5299436215958133, 0.6603978651754309, 0.5434782608695652, 0.5180726803510725, 0.5397330231206089] 10 | 11 | Overmax : [0.3419349129512632, 0.12229155074156817, 0.04065040650406504, 0.22693256750314608, 0.1996387816571787] 12 | 13 | BLEU : [0.20766051678993033, 0.06710522329318126, 0.008102187273874817, 0.13727174463055833, 0.10898556862848591] 14 | 15 | METEOR : [0.2793934135384032, 0.2208271663171727, 0.07957693645411881, 0.2646177446072684, 0.27606406878022083] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_6_to_6.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.48642894672702214, 0.05622732695209043, nan, 0.14346283549140906, 0.17064937503116137] 2 | 3 | ROUGE-2 : [0.1854022327705856, 0.03206291517726667, nan, 0.07588435239288402, 0.08195517648550002] 4 | 5 | ROUGE-L : [0.2601156020867721, 0.12217795282395873, nan, 0.15308798793105172, 0.17950383803531755] 6 | 7 | Jaccard : [0.21015761821366025, 0.08273894436519258, nan, 
0.1094601873393993, 0.12801973711611822] 8 | 9 | Overmin : [0.42402826855123676, 0.6744186046511628, nan, 0.5416305527753607, 0.5902190086641533] 10 | 11 | Overmax : [0.29411764705882354, 0.0861812778603269, nan, 0.12502248409172032, 0.14529110944524504] 12 | 13 | BLEU : [0.25443944336292407, 0.023403439508237603, nan, 0.06721678443670033, 0.07259928847103045] 14 | 15 | METEOR : [0.34068642688611045, 0.12831736879879074, nan, 0.19187715719340723, 0.2340890020768544] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_7_to_7.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [0.5031775652432486, nan, nan, 0.08330512810320766, 0.14869182094356392] 2 | 3 | ROUGE-2 : [0.16311260270706693, nan, nan, 0.03848096668787078, 0.071635156137997] 4 | 5 | ROUGE-L : [0.25965664744340483, nan, nan, 0.10069932190424496, 0.1671486514949222] 6 | 7 | Jaccard : [0.2217741935483871, nan, nan, 0.06433987537524902, 0.11561122419170511] 8 | 9 | Overmin : [0.40441176470588236, nan, nan, 0.4847159521284654, 0.5933095801745052] 10 | 11 | Overmax : [0.32934131736526945, nan, nan, 0.06870857517597032, 0.12881654415623642] 12 | 13 | BLEU : [0.20384427508907552, nan, nan, 0.03059303715818409, 0.062194216001109436] 14 | 15 | METEOR : [0.26688130920058434, nan, nan, 0.1382135143489827, 0.2300308308469104] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_8_to_8.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [nan, nan, nan, nan, 0.1424875923597296] 2 | 3 | ROUGE-2 : [nan, nan, nan, nan, 0.07191432834240229] 4 | 5 | ROUGE-L : [nan, nan, nan, nan, 0.16497224231413846] 6 | 7 | Jaccard : [nan, nan, nan, nan, 0.11669347808375624] 8 | 9 | Overmin : [nan, nan, nan, nan, 0.6157205727885002] 10 | 11 | Overmax : [nan, nan, nan, nan, 0.13084636425302795] 12 | 13 | BLEU : [nan, nan, nan, nan, 0.06431990111934095] 14 | 15 | METEOR : [nan, nan, nan, nan, 0.19972536370894084] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/result_files/Rank_9_to_9.txt: -------------------------------------------------------------------------------- 1 | ROUGE-1 : [nan, nan, nan, nan, 0.20743520259011347] 2 | 3 | ROUGE-2 : [nan, nan, nan, nan, 0.12754627726528367] 4 | 5 | ROUGE-L : [nan, nan, nan, nan, 0.24089514102503254] 6 | 7 | Jaccard : [nan, nan, nan, nan, 0.1695187516391293] 8 | 9 | Overmin : [nan, nan, nan, nan, 0.672878836147393] 10 | 11 | Overmax : [nan, nan, nan, nan, 0.19094864445388848] 12 | 13 | BLEU : [nan, nan, nan, nan, 0.10987875449695482] 14 | 15 | METEOR : [nan, nan, nan, nan, 0.23676168952208362] 16 | 17 | -------------------------------------------------------------------------------- /Models/Explanations/metrics and results/xl_anno_make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import json 8 | g_json = open("gold_explanations.json", "r") 9 | gold_exp = json.load(g_json) 10 | 11 | 12 | # In[2]: 13 | 14 | 15 | #!export PYTHONPATH=$PYTHONPATH:/opt/anaconda3/lib/python3.7/site-packages 16 | 17 | 18 | # In[3]: 19 | 20 | 21 | import nltk 22 | from nltk.stem import PorterStemmer 23 | from nltk.tokenize import word_tokenize 24 | from rouge import 
Rouge 25 | import nltk.translate 26 | from nltk.translate import meteor_score 27 | import xlsxwriter 28 | import progressbar 29 | 30 | 31 | # In[3]: 32 | 33 | 34 | files = list(gold_exp.keys()) 35 | 36 | 37 | # In[4]: 38 | 39 | 40 | def get_BLEU_score(ref_text, machine_text): 41 | tok_ref_text = word_tokenize(ref_text) 42 | tok_machine_text = word_tokenize(machine_text) 43 | sc = nltk.translate.bleu_score.sentence_bleu([tok_ref_text], tok_machine_text) 44 | return sc 45 | 46 | def jaccard_similarity(query, document): 47 | query = word_tokenize(query) 48 | document = word_tokenize(document) 49 | intersection = set(query).intersection(set(document)) 50 | union = set(query).union(set(document)) 51 | if(len(union)==0): 52 | return 0 53 | return len(intersection)/len(union) 54 | 55 | def overlap_coefficient_min(query, document): 56 | query = word_tokenize(query) 57 | document = word_tokenize(document) 58 | intersection = set(query).intersection(set(document)) 59 | den = min(len(set(query)),len(set(document))) 60 | if(den==0): 61 | return 0 62 | return len(intersection)/den 63 | 64 | def overlap_coefficient_max(query, document): 65 | query = word_tokenize(query) 66 | document = word_tokenize(document) 67 | intersection = set(query).intersection(set(document)) 68 | den = max(len(set(query)),len(set(document))) 69 | if(den==0): 70 | return 0 71 | return len(intersection)/den 72 | 73 | 74 | # In[5]: 75 | 76 | 77 | book = xlsxwriter.Workbook("anno_explanations_scores" + ".xlsx") #creating new xlsx file 78 | cell_format = book.add_format() 79 | cell_format.set_text_wrap() 80 | cell_format.set_align("top") 81 | 82 | 83 | # In[6]: 84 | 85 | 86 | def write_dummy_labels(sheet, row, column): 87 | sheet.write(row, column, "User 1", cell_format) 88 | sheet.write(row, column+1, "User 2", cell_format) 89 | sheet.write(row, column+2, "User 3", cell_format) 90 | sheet.write(row, column+3, "User 4", cell_format) 91 | sheet.write(row, column+4, "User 5", cell_format) 92 | sheet.write(row, column+5, "Jaccard", cell_format) 93 | sheet.write(row, column+6, "OverlapMin", cell_format) 94 | sheet.write(row, column+7, "OverlapMax", cell_format) 95 | sheet.write(row, column+8, "ROUGE-1", cell_format) 96 | sheet.write(row, column+9, "ROUGE-2", cell_format) 97 | sheet.write(row, column+10, "ROUGE-L", cell_format) 98 | sheet.write(row, column+11, "BLEU", cell_format) 99 | sheet.write(row, column+12, "METEOR", cell_format) 100 | row=1 101 | column=0 102 | sheet.write(row, column, "Explanation", cell_format) #write rank labels on column 0 103 | sheet.write(row+1, column, "Decision", cell_format) #11th row has the label decision 104 | 105 | 106 | 107 | def metric_score(metric, machine_text, ref_text): 108 | if(metric == "ROUGE-1"): 109 | rouge = Rouge() 110 | score = rouge.get_scores(machine_text, ref_text) 111 | return score[0]['rouge-1']['f'] 112 | elif(metric == "ROUGE-2"): 113 | rouge = Rouge() 114 | score = rouge.get_scores(machine_text, ref_text) 115 | return score[0]['rouge-2']['f'] 116 | elif(metric == "ROUGE-L"): 117 | rouge = Rouge() 118 | score = rouge.get_scores(machine_text, ref_text) 119 | return score[0]['rouge-l']['f'] 120 | elif(metric == "Jaccard"): 121 | return jaccard_similarity(ref_text, machine_text) 122 | elif(metric == "Overmin"): 123 | return overlap_coefficient_min(ref_text, machine_text) 124 | elif(metric == "Overmax"): 125 | return overlap_coefficient_max(ref_text, machine_text) 126 | elif(metric == "BLEU"): 127 | return get_BLEU_score(ref_text, machine_text) 128 | elif(metric == "METEOR"): 129 | 
return nltk.translate.meteor_score.meteor_score([ref_text], machine_text) 130 |     else: 131 |         print("An unknown metric was given.") 132 |         return 0 133 | 134 | 135 | # In[7]: 136 | 137 | 138 | def writing_sheet_for_users(file_name, gold_exp, book): 139 |     sheet = book.add_worksheet() #add a sheet 140 |     sheet.set_column(0,0,15) 141 |     sheet.set_column(1,5,70) 142 |     sheet.set_column(6,13,40) 143 |     row, column = 0,0 144 |     sheet.write(row, column, file_name, cell_format) 145 |     column+=1 146 |     write_dummy_labels(sheet, row, column) 147 |     row,column=1,1 148 | 149 |     for u in range(5): 150 |         explanation = gold_exp[file_name]["User " + str(u+1)]["exp"] 151 |         verdict = gold_exp[file_name]["User " + str(u+1)]["verdict"] 152 |         sheet.write(row,column,explanation, cell_format) 153 |         sheet.write(row+1,column,verdict,cell_format) 154 |         column+=1 155 | 156 |     metrics = ["Jaccard", "Overmin", "Overmax", "ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU", "METEOR"] 157 | 158 |     row=1 159 |     column=6 160 |     for i,metric in enumerate(metrics): 161 |         metric_string = "" 162 |         for u1 in range(1,6,1): 163 |             inistring = "User " + str(u1) + "vs User " 164 |             for u2 in range(1,6,1): 165 |                 if(u1==u2): 166 |                     continue 167 |                 score = metric_score(metric, gold_exp[file_name]["User " + str(u1)]["exp"], gold_exp[file_name]["User " + str(u2)]["exp"]) 168 |                 rounded_score = round(score,4) 169 |                 finstring = inistring + str(u2) + " = " + str(rounded_score) 170 |                 metric_string += finstring + "\n\n" 171 | 172 |         sheet.write(row, column, metric_string, cell_format) 173 |         column+=1 174 | 175 | 176 | # In[8]: 177 | 178 | 179 | for i in progressbar.progressbar(range(len(files))): 180 |     writing_sheet_for_users(files[i], gold_exp, book) 181 | 182 | # In[107]: 183 | 184 | 185 | book.close() 186 | 187 | 188 | # In[ ]: 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /Models/README.md: -------------------------------------------------------------------------------- 1 | # Case Decision Prediction 2 | 3 | ILDC documents are long and have specialized vocabulary compared to the typical corpora used for training text classification models and language models. For predicting the case decisions we experimented with various types of models: 4 | - **Classical Models** 5 | - **Sequential Models** 6 | - **CNN** 7 | - **Transformers** 8 | 9 | Please refer to each folder for more details on each model. The ipynb notebooks are annotated with relevant comments that should let you go through them without much trouble. The `Explanations` directory contains the code for the occlusion algorithm that was used to generate explanations for our model's output; a minimal sketch of the idea follows below.
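The occlusion idea, roughly: mask out one chunk of the document at a time and measure how much the model's predicted probability drops; the chunks whose removal changes the prediction the most are taken as the most important ones. A minimal sketch of this (an illustration only, not the exact implementation in `Explanations/`), assuming a Keras-style classifier over chunk embeddings and the -99. mask value used by the hierarchical models in this repo:

```python
import numpy as np

def occlusion_scores(chunk_embeddings, model, mask_value=-99.0):
    """Score each chunk by how much occluding it changes the model's prediction.

    chunk_embeddings: (num_chunks, emb_dim) array for one document.
    model: a trained classifier exposing predict() on a (1, num_chunks, emb_dim) batch.
    """
    base = model.predict(chunk_embeddings[None, ...])[0, 0]  # unoccluded prediction
    scores = []
    for i in range(len(chunk_embeddings)):
        occluded = chunk_embeddings.copy()
        occluded[i, :] = mask_value              # occlude chunk i (it gets masked out)
        pred = model.predict(occluded[None, ...])[0, 0]
        scores.append(base - pred)               # large drop => important chunk
    return np.asarray(scores)
```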
10 | -------------------------------------------------------------------------------- /Models/Sequential_Models/BIGRU_final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "accelerator": "GPU", 6 | "colab": { 7 | "name": "BIGRU_final.ipynb", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "display_name": "Python 3", 13 | "language": "python", 14 | "name": "python3" 15 | }, 16 | "language_info": { 17 | "codemirror_mode": { 18 | "name": "ipython", 19 | "version": 3 20 | }, 21 | "file_extension": ".py", 22 | "mimetype": "text/x-python", 23 | "name": "python", 24 | "nbconvert_exporter": "python", 25 | "pygments_lexer": "ipython3", 26 | "version": "3.7.3" 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "colab_type": "code", 34 | "id": "EWcymo_FWADn", 35 | "colab": { 36 | "base_uri": "https://localhost:8080/", 37 | "height": 34 38 | }, 39 | "outputId": "b83a2fef-d042-4ab9-ef19-4dd8cc81ec5c" 40 | }, 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from keras import Sequential\n", 45 | "from keras.utils import Sequence\n", 46 | "from keras.layers import LSTM, Dense, Masking, GRU\n", 47 | "import numpy as np\n", 48 | "import keras\n", 49 | "from keras.utils import np_utils\n", 50 | "from keras import optimizers\n", 51 | "from keras.models import Sequential, Model\n", 52 | "from keras.layers import Embedding, Dense, Input, concatenate, Layer, Lambda, Dropout, Activation\n", 53 | "import datetime\n", 54 | "from datetime import datetime\n", 55 | "from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback, TensorBoard\n", 56 | "from keras.callbacks import ReduceLROnPlateau\n", 57 | "from keras.models import load_model\n", 58 | "import tensorflow as tf\n", 59 | "import tensorflow_hub as hub\n", 60 | "import numpy as np\n", 61 | "from numpy import load\n", 62 | "import pandas as pd\n", 63 | "from sklearn.metrics import accuracy_score\n", 64 | "from sklearn.metrics import confusion_matrix\n", 65 | "\n", 66 | "\n", 67 | "np.random.seed(1337)# setting the random seed value" 68 | ], 69 | "execution_count": null, 70 | "outputs": [ 71 | { 72 | "output_type": "stream", 73 | "text": [ 74 | "Using TensorFlow backend.\n" 75 | ], 76 | "name": "stderr" 77 | } 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "8ZULOpIvF4Yt", 84 | "colab_type": "code", 85 | "colab": { 86 | "base_uri": "https://localhost:8080/", 87 | "height": 122 88 | }, 89 | "outputId": "ef8c417f-0d2d-4b72-d380-b3cfb53c3297" 90 | }, 91 | "source": [ 92 | "# Mounting Drive\n", 93 | "from google.colab import drive\n", 94 | "drive.mount('/content/drive')" 95 | ], 96 | "execution_count": null, 97 | "outputs": [ 98 | { 99 | "output_type": "stream", 100 | "text": [ 101 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n", 102 | "\n", 103 | "Enter your authorization code:\n", 104 | "··········\n", 105 | "Mounted at /content/drive\n" 106 | ], 107 | "name": "stdout" 108 | } 109 | ] 110 | }, 
111 | { 112 | "cell_type": "code", 113 | "metadata": { 114 | "id": "e2RgVj0jFrFp", 115 | "colab_type": "code", 116 | "colab": {} 117 | }, 118 | "source": [ 119 | "path_dataset = \"drive/My Drive/dataset.csv\" # path to dataset" 120 | ], 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "colab_type": "code", 128 | "id": "fwfyU5uCRR0K", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "dataset = pd.read_csv(path_dataset) # loading dataset" 133 | ], 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "colab_type": "code", 141 | "id": "_1YqIPZ7L2K5", 142 | "colab": {} 143 | }, 144 | "source": [ 145 | "# path to transformer generated chunk embeddings eg. XLNet etc.\n", 146 | "path_transformer_chunk_embeddings_train = 'drive/My Drive/XLNet/XLNet_train.npy' \n", 147 | "path_transformer_chunk_embeddings_dev = 'drive/My Drive/XLNet/XLNet_dev.npy'\n", 148 | "path_transformer_chunk_embeddings_test = 'drive/My Drive/XLNet/XLNet_test.npy'" 149 | ], 150 | "execution_count": null, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "colab_type": "code", 157 | "id": "c5rkC-VzPPBa", 158 | "colab": {} 159 | }, 160 | "source": [ 161 | "# loading the chunk embeddings\n", 162 | "x_train0 = load(path_transformer_chunk_embeddings_train, allow_pickle = True)\n", 163 | "x_dev0 = load(path_transformer_chunk_embeddings_dev, allow_pickle= True)\n", 164 | "x_test0 = load(path_transformer_chunk_embeddings_test, allow_pickle= True)" 165 | ], 166 | "execution_count": null, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "colab_type": "code", 173 | "id": "-aVSH_kLQ3pM", 174 | "colab": {} 175 | }, 176 | "source": [ 177 | "# loading the corresponding label for each case in dataset\n", 178 | "dev = dataset.loc[dataset['split'] == 'dev'] \n", 179 | "train = dataset.loc[dataset['split'] == 'train'] \n", 180 | "test = dataset.loc[dataset['split'] == 'test'] \n", 181 | "\n", 182 | "y_train0 = []\n", 183 | "for i in range(train.shape[0]):\n", 184 | " y_train0.append(train.loc[i,'label']) \n", 185 | " \n", 186 | "y_dev0 = []\n", 187 | "for i in range(dev.shape[0]):\n", 188 | " y_dev0.append(dev.loc[i+32305,'label'])\n", 189 | "\n", 190 | "y_test0 = []\n", 191 | "for i in range(test.shape[0]):\n", 192 | " y_test0.append(test.loc[i+33299,'label'])" 193 | ], 194 | "execution_count": null, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "colab_type": "code", 201 | "id": "PnMJO4OQcI5g", 202 | "colab": { 203 | "base_uri": "https://localhost:8080/", 204 | "height": 357 205 | }, 206 | "outputId": "45d6ec33-52b2-4571-da40-767ae20242fc" 207 | }, 208 | "source": [ 209 | "from keras import layers\n", 210 | "# Input layer to convert into required tensor shape\n", 211 | "text_input = Input(shape=(None,768,), dtype='float32', name='text')\n", 212 | "# Masking layer to mask the padded values\n", 213 | "l_mask = layers.Masking(mask_value=-99.)(text_input)\n", 214 | "# After masking we encoded the vector using 2 bidirectional GRU's\n", 215 | "encoded_text = layers.Bidirectional(GRU(100,return_sequences=True))(l_mask)\n", 216 | "encoded_text1 = layers.Bidirectional(GRU(100,))(encoded_text)\n", 217 | "# Added a dense layer after encoding\n", 218 | "out_dense = layers.Dense(30, activation='relu')(encoded_text1)\n", 219 | "# And we add a sigmoid classifier on top\n", 220 | "out = 
layers.Dense(1, activation='sigmoid')(out_dense)\n", 221 | "# At model instantiation, we specify the input and the output:\n", 222 | "model = Model(text_input, out)\n", 223 | "model.compile(optimizer='Adam',\n", 224 | " loss='binary_crossentropy',\n", 225 | " metrics=['acc'])\n", 226 | "model.summary()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [ 230 | { 231 | "output_type": "stream", 232 | "text": [ 233 | "Model: \"model_1\"\n", 234 | "_________________________________________________________________\n", 235 | "Layer (type) Output Shape Param # \n", 236 | "=================================================================\n", 237 | "text (InputLayer) (None, None, 768) 0 \n", 238 | "_________________________________________________________________\n", 239 | "masking_1 (Masking) (None, None, 768) 0 \n", 240 | "_________________________________________________________________\n", 241 | "bidirectional_1 (Bidirection (None, None, 200) 521400 \n", 242 | "_________________________________________________________________\n", 243 | "bidirectional_2 (Bidirection (None, 200) 180600 \n", 244 | "_________________________________________________________________\n", 245 | "dense_1 (Dense) (None, 30) 6030 \n", 246 | "_________________________________________________________________\n", 247 | "dense_2 (Dense) (None, 1) 31 \n", 248 | "=================================================================\n", 249 | "Total params: 708,061\n", 250 | "Trainable params: 708,061\n", 251 | "Non-trainable params: 0\n", 252 | "_________________________________________________________________\n" 253 | ], 254 | "name": "stdout" 255 | } 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "colab_type": "code", 262 | "id": "YgVbg80acVar", 263 | "colab": {} 264 | }, 265 | "source": [ 266 | "num_sequences = len(x_train0)\n", 267 | "batch_size = 32 \n", 268 | "batches_per_epoch = int(num_sequences/batch_size)\n", 269 | "num_features= 768\n", 270 | "def train_generator(): # function to generate batches of corresponding batch size\n", 271 | " x_list= x_train0\n", 272 | " y_list = y_train0\n", 273 | " # Generate batches\n", 274 | " while True:\n", 275 | " for b in range(batches_per_epoch):\n", 276 | " longest_index = (b + 1) * batch_size - 1\n", 277 | " timesteps = len(max(x_train0[:(b + 1) * batch_size][-batch_size:], key=len))\n", 278 | " x_train = np.full((batch_size, timesteps, num_features), -99.)\n", 279 | " y_train = np.zeros((batch_size, 1))\n", 280 | " # padding the vectors with respect to the maximum sequence of each batch and not the whole training data\n", 281 | " for i in range(batch_size):\n", 282 | " li = b * batch_size + i\n", 283 | " x_train[i, 0:len(x_list[li]), :] = x_list[li]\n", 284 | " y_train[i] = y_list[li]\n", 285 | " yield x_train, y_train" 286 | ], 287 | "execution_count": null, 288 | "outputs": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "metadata": { 293 | "colab_type": "code", 294 | "id": "vnGZeO1ieiAQ", 295 | "colab": {} 296 | }, 297 | "source": [ 298 | "num_sequences_val = len(x_dev0)\n", 299 | "batch_size_val = 32\n", 300 | "batches_per_epoch_val = int(num_sequences_val/batch_size_val)\n", 301 | "num_features= 768\n", 302 | "def val_generator():# Similar function to generate validation batches\n", 303 | " x_list= x_dev0\n", 304 | " y_list = y_dev0\n", 305 | " # Generate batches\n", 306 | " while True:\n", 307 | " for b in range(batches_per_epoch_val):\n", 308 | " longest_index = (b + 1) * batch_size_val - 1\n", 309 | " timesteps = len(max(x_dev0[:(b 
+ 1) * batch_size_val][-batch_size_val:], key=len))\n", 310 |         "      x_train = np.full((batch_size_val, timesteps, num_features), -99.) # pad with the mask value, consistent with the train and test generators\n", 311 |         "      y_train = np.zeros((batch_size_val,  1))\n", 312 |         "      # padding the vectors with respect to the maximum sequence of each batch and not the whole validation data\n", 313 |         "      for i in range(batch_size_val):\n", 314 |         "          li = b * batch_size_val + i\n", 315 |         "          x_train[i, 0:len(x_list[li]), :] = x_list[li]\n", 316 |         "          y_train[i] = y_list[li]\n", 317 |         "      yield x_train, y_train" 318 |       ], 319 |       "execution_count": null, 320 |       "outputs": [] 321 |     }, 322 |     { 323 |       "cell_type": "code", 324 |       "metadata": { 325 |         "colab_type": "code", 326 |         "id": "MYZ7yr9mlYk_", 327 |         "colab": { 328 |           "base_uri": "https://localhost:8080/", 329 |           "height": 136 330 |         }, 331 |         "outputId": "f35a0985-c74e-44b6-f3ad-9a0b61a513ab" 332 |       }, 333 |       "source": [ 334 |         "# Setting the callback and training the model\n", 335 |         "call_reduce = ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=2, verbose=2,\n", 336 |         "                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)\n", 337 |         "\n", 338 |         "model.fit_generator(train_generator(), steps_per_epoch=batches_per_epoch, epochs=3,\n", 339 |         "                    validation_data=val_generator(), validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )" 340 |       ], 341 |       "execution_count": null, 342 |       "outputs": [ 343 |         { 344 |           "output_type": "stream", 345 |           "text": [ 346 |             "Epoch 1/3\n", 347 |             "1009/1009 [==============================] - 258s 256ms/step - loss: 0.4816 - acc: 0.7650 - val_loss: 0.6094 - val_acc: 0.7157\n", 348 |             "Epoch 2/3\n", 349 |             "1009/1009 [==============================] - 263s 261ms/step - loss: 0.4629 - acc: 0.7752 - val_loss: 0.5629 - val_acc: 0.7742\n", 350 |             "Epoch 3/3\n", 351 |             "1009/1009 [==============================] - 261s 259ms/step - loss: 0.4557 - acc: 0.7795 - val_loss: 0.5828 - val_acc: 0.7812\n" 352 |           ], 353 |           "name": "stdout" 354 |         }, 355 |         { 356 |           "output_type": "execute_result", 357 |           "data": { 358 |             "text/plain": [ 359 |               "" 360 |             ] 361 |           }, 362 |           "metadata": { 363 |             "tags": [] 364 |           }, 365 |           "execution_count": 15 366 |         } 367 |       ] 368 |     }, 369 |     { 370 |       "cell_type": "code", 371 |       "metadata": { 372 |         "colab_type": "code", 373 |         "id": "Y3ET-FUAx8Su", 374 |         "colab": {} 375 |       }, 376 |       "source": [ 377 |         "def test_generator(): # function to generate batches of corresponding batch size\n", 378 |         "  x_list= x_test0\n", 379 |         "  y_list = y_test0\n", 380 |         "  # Generate batches\n", 381 |         "  while True:\n", 382 |         "    for b in range(batches_per_epoch_test):\n", 383 |         "      if(b == batches_per_epoch_test-1): # An extra if else statement just to manage the last batch, as its size might not be equal to the batch size \n", 384 |         "        longest_index = num_sequences_test - 1\n", 385 |         "        timesteps = len(max(x_test0[:longest_index + 1][-batch_size_test:], key=len))\n", 386 |         "        x_train = np.full((longest_index - b*batch_size_test, timesteps, num_features), -99.)\n", 387 |         "        y_train = np.zeros((longest_index - b*batch_size_test, 1))\n", 388 |         "        for i in range(longest_index - b*batch_size_test):\n", 389 |         "          li = b * batch_size_test + i\n", 390 |         "          x_train[i, 0:len(x_list[li]), :] = x_list[li]\n", 391 |         "          y_train[i] = y_list[li]\n", 392 |         "      else:\n", 393 |         "        longest_index = (b + 1) * batch_size_test - 1\n", 394 |         "        timesteps = len(max(x_test0[:(b + 1) * batch_size_test][-batch_size_test:], key=len))\n", 395 |         "        x_train = np.full((batch_size_test, timesteps, num_features), -99.)\n", 396 |         "        y_train = np.zeros((batch_size_test,  1))\n", 397 |         "        # padding the vectors 
with respect to the maximum sequence of each batch and not the whole test data\n", 398 | " for i in range(batch_size_test):\n", 399 | " li = b * batch_size_test + i\n", 400 | " x_train[i, 0:len(x_list[li]), :] = x_list[li]\n", 401 | " y_train[i] = y_list[li]\n", 402 | " yield x_train, y_train" 403 | ], 404 | "execution_count": null, 405 | "outputs": [] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "metadata": { 410 | "colab_type": "code", 411 | "id": "i-HzpAFHdD-t", 412 | "colab": { 413 | "base_uri": "https://localhost:8080/", 414 | "height": 34 415 | }, 416 | "outputId": "d66bf71d-5718-4091-b4cd-12435f5c7794" 417 | }, 418 | "source": [ 419 | "num_sequences_test = len(x_test0)\n", 420 | "batch_size_test = 32\n", 421 | "batches_per_epoch_test = int(num_sequences_test/batch_size_test) + 1\n", 422 | "num_features= 768\n", 423 | "# evaluating on the test data\n", 424 | "model.evaluate_generator(test_generator(), steps= batches_per_epoch_test)" 425 | ], 426 | "execution_count": null, 427 | "outputs": [ 428 | { 429 | "output_type": "execute_result", 430 | "data": { 431 | "text/plain": [ 432 | "[0.5188975930213928, 0.7671504020690918]" 433 | ] 434 | }, 435 | "metadata": { 436 | "tags": [] 437 | }, 438 | "execution_count": 17 439 | } 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "metadata": { 445 | "id": "4bko5x4jKnpl", 446 | "colab_type": "code", 447 | "colab": {} 448 | }, 449 | "source": [ 450 | "# defining a function which calculates various metrics such as micro and macro precision, accuracy and f1\n", 451 | "def metrics_calculator(preds, test_labels):\n", 452 | " cm = confusion_matrix(test_labels, preds)\n", 453 | " TP = []\n", 454 | " FP = []\n", 455 | " FN = []\n", 456 | " for i in range(0,2):\n", 457 | " summ = 0\n", 458 | " for j in range(0,2):\n", 459 | " if(i!=j):\n", 460 | " summ=summ+cm[i][j]\n", 461 | "\n", 462 | " FN.append(summ)\n", 463 | " for i in range(0,2):\n", 464 | " summ = 0\n", 465 | " for j in range(0,2):\n", 466 | " if(i!=j):\n", 467 | " summ=summ+cm[j][i]\n", 468 | "\n", 469 | " FP.append(summ)\n", 470 | " for i in range(0,2):\n", 471 | " TP.append(cm[i][i])\n", 472 | " precision = []\n", 473 | " recall = []\n", 474 | " for i in range(0,2):\n", 475 | " precision.append(TP[i]/(TP[i] + FP[i]))\n", 476 | " recall.append(TP[i]/(TP[i] + FN[i]))\n", 477 | "\n", 478 | " macro_precision = sum(precision)/2\n", 479 | " macro_recall = sum(recall)/2\n", 480 | " micro_precision = sum(TP)/(sum(TP) + sum(FP))\n", 481 | " micro_recall = sum(TP)/(sum(TP) + sum(FN))\n", 482 | " micro_f1 = (2*micro_precision*micro_recall)/(micro_precision + micro_recall)\n", 483 | " macro_f1 = (2*macro_precision*macro_recall)/(macro_precision + macro_recall)\n", 484 | " return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1" 485 | ], 486 | "execution_count": null, 487 | "outputs": [] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "metadata": { 492 | "colab_type": "code", 493 | "id": "b49_aClD2TFO", 494 | "colab": { 495 | "base_uri": "https://localhost:8080/", 496 | "height": 34 497 | }, 498 | "outputId": "99d4646c-56e5-4111-dbbe-bb6e20fcf9cf" 499 | }, 500 | "source": [ 501 | "# getting the predicted labels on the test data\n", 502 | "preds = model.predict_generator(test_generator(), steps= batches_per_epoch_test)\n", 503 | "y_pred = preds > 0.5\n", 504 | "\n", 505 | "# Calculating all metrics on test data predicted label\n", 506 | "print(metrics_calculator(y_pred, y_test0[:-1]))" 507 | ], 508 | "execution_count": null, 509 | "outputs": [ 510 | { 511 | 
"output_type": "stream", 512 | "text": [ 513 | "(0.7705673758865248, 0.7669248374829216, 0.7687417918381483, 0.7671503957783641, 0.7671503957783641, 0.7671503957783641)\n" 514 | ], 515 | "name": "stdout" 516 | } 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "metadata": { 522 | "colab_type": "code", 523 | "id": "MLGnNSmqLahb", 524 | "colab": { 525 | "base_uri": "https://localhost:8080/", 526 | "height": 34 527 | }, 528 | "outputId": "eb209db8-22b6-4039-e691-078bb109b06b" 529 | }, 530 | "source": [ 531 | "# getting the predicted labels on the dev data\n", 532 | "preds = model.predict_generator(val_generator(), steps= batches_per_epoch_val)\n", 533 | "y_pred_dev = preds > 0.5\n", 534 | "\n", 535 | "# Calculating all metrics on dev data predicted label\n", 536 | "print(metrics_calculator(y_pred_dev, y_dev0[:-2]))" 537 | ], 538 | "execution_count": null, 539 | "outputs": [ 540 | { 541 | "output_type": "stream", 542 | "text": [ 543 | "(0.7818560656979799, 0.78125, 0.7815529153535627, 0.78125, 0.78125, 0.78125)\n" 544 | ], 545 | "name": "stdout" 546 | } 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "metadata": { 552 | "colab_type": "code", 553 | "id": "ITN1ODSfa61U", 554 | "colab": {} 555 | }, 556 | "source": [ 557 | "# saving the trained model\n", 558 | "model.save('BIGRU_XLNet.h5') # creates a HDF5 file 'BIGRU_XLNet.h5'" 559 | ], 560 | "execution_count": null, 561 | "outputs": [] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "metadata": { 566 | "colab_type": "code", 567 | "id": "NN5eSAN7R7As", 568 | "colab": {} 569 | }, 570 | "source": [ 571 | "# loading the model\n", 572 | "# model = load_model('BIGRU_XLNet.h5')" 573 | ], 574 | "execution_count": null, 575 | "outputs": [] 576 | } 577 | ] 578 | } -------------------------------------------------------------------------------- /Models/Sequential_Models/CatchPhrase/README.md: -------------------------------------------------------------------------------- 1 | ## Link of the Dataset ## 2 | The **CatchPhrase** dataset can be found [here](https://drive.google.com/drive/folders/1xY0b5WgFSgcMFcMM1BGicBSvof-7iGeP?usp=sharing). 3 | 4 | ### Description of CatchPhrase Dataset ### 5 | 6 | * Firstly, we extract the noun phrases using the paper [Automatic Catchphrase Identification from Legal Court Case Documents, by A Mandal, K Ghosh, A Pal, S Ghosh at CIKM, 2017](https://dl.acm.org/doi/10.1145/3132847.3133102) and Github repo of [NNP-extractor](https://github.com/amarnamarpan/NNP-extractor-a-highly-customisable-noun-phrase-extraction-module). 7 | 8 | * After we get the noun phrases we passes for the score that phrase, sentence or word using the above mentioned paper and Github repo of [PSLEGAL - An unsupervised way of legal catchphrase extraction](https://github.com/amarnamarpan/PSLEGAL-An-unsupervised-way-of-legal-catchphrase-extraction). 9 | 10 | * Using the scored phrases for each documents first we sort the phrases according to the score. 11 | 12 | * We extract the sentences using the top scored phrases corresponding to that documents. 13 | 14 | * Using the NLTK sentence tokenizer we get 200 sentences covered 90% of the data in catch-phrase dataset, so we extracted only top 200 sentences form the corresponding documents. 15 | 16 | ### Embeddings ### 17 | 18 | We have used two types of embeddings with classical models: **Sent2Vec** and **Doc2Vec**. Both of these models were trained on the train set of **CatchPhrase** dataset. 
We are not releasing the trained embedding models themselves, but we are releasing the mapped vectors for sentences (in the case of Sent2Vec) and documents (in the case of Doc2Vec), as we believe these are more convenient to use. If you want to train your own models, you can use the train set of the **CatchPhrase** dataset to train the embeddings. We used the following codebases for reference: 19 | * For Sent2Vec [(here)](https://ilmoirfan.com/how-to-train-sent2vec-model/) 20 | * For Doc2Vec [(here)](https://radimrehurek.com/gensim/models/doc2vec.html) 21 | 22 | #### Sent2Vec embeddings of CatchPhrase dataset: #### 23 | 24 | **Config**: mincount = 5, vocab size = 750000, n-gram = 2, dimension size = 200. You can find the numpy files [here](https://drive.google.com/drive/folders/1vPE5GZdIVsLNVZFCR5DJBuaucUFi4IZQ?usp=sharing) 25 | 26 | #### Doc2Vec embeddings of CatchPhrase dataset: #### 27 | 28 | **Config**: dimension size = 500. You can find the numpy files [here](https://drive.google.com/drive/folders/1vPE5GZdIVsLNVZFCR5DJBuaucUFi4IZQ?usp=sharing) 29 | 30 | #### For reproducing results: #### 31 | * Sent2Vec: You will need the numpy files for Sent2Vec mentioned above. You can then run the following command: 32 | ``` 33 | python classical_models_sent2vec_avgd.py path_to/train_set_npy_file path_to/test_set_npy_file path/to/dev_set_npy_file 34 | ``` 35 | * Doc2Vec: You will need the numpy files for Doc2Vec mentioned above (use any one configuration). You can then run the following command: 36 | ``` 37 | python classical_models_doc2vec.py path_to/train_set_npy_file path_to_train_set_labels_file path_to/test_set_npy_file path_to/test_set_labels_file path_to/dev_set_npy_file path/to/dev_set_labels_file 38 | ``` 39 | 40 | ### Models ### 41 | We run two sequential models on the CatchPhrase dataset: 42 | * BiGRU 43 |   * Sent2Vec embeddings of CatchPhrase dataset 44 |   * Doc2Vec embeddings of CatchPhrase dataset 45 | * BiGRU with attention 46 |   * Sent2Vec embeddings of CatchPhrase dataset 47 |   * Doc2Vec embeddings of CatchPhrase dataset 48 | -------------------------------------------------------------------------------- /Models/Sequential_Models/README.md: -------------------------------------------------------------------------------- 1 | ## Sequential Models 2 | 3 | 4 | ### BIGRU 5 | 6 | We used BIGRU with four types of inputs: 7 | - **Sent2Vec Embeddings** (Embedding Dim: 200). For the Sent2Vec embeddings numpy file, please refer to *CJPE/Classical Models/README.md* 8 | - **GloVe Embeddings** (last 512 tokens of each case, tokenized with the NLTK tokenizer, Embedding Dim: 180). GloVe model: [link](https://drive.google.com/drive/folders/1vNcOo4e8-qEhWQosJCND6g6_lTsZdOua) 9 | - **Doc2Vec Embeddings** (applied to every 500-token chunk, tokenized with the NLTK tokenizer, Embedding Dim: 1000). For the Doc2Vec embeddings numpy file, please refer to *CJPE/Classical Models/README.md* 10 | - **Transformer Chunk Embeddings** (e.g. BERT, XLNet, RoBERTa), with each chunk embedding of Dim 768 (except the concatenated one). For more details please refer to the *CJPE/transformers/* folder. 11 | > **Note1**: For all of the above we used both the *ILDCsingle* and *ILDCmulti* datasets. 12 | 13 | > **Note2**: For the 4th point, the complete model with transformers is called the Hierarchical Model. 14 | 15 | ### BIGRU with Attention 16 | 17 | The inputs are the same as for BIGRU; we simply add an **Attention Layer** on top of the BIGRU to assign a weight to each chunk according to its importance for label prediction, as sketched below. 
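A minimal sketch of such an attention pooling layer (written with `tensorflow.keras`; this is an illustration of the idea rather than the exact layer in **BIGRU_Attention_final.ipynb**, and masking of padded chunks is omitted for brevity):

```python
import tensorflow as tf
from tensorflow.keras import layers

class ChunkAttention(layers.Layer):
    """Additive attention pooling: learns a softmax weight for each chunk encoding."""
    def build(self, input_shape):
        d = int(input_shape[-1])
        self.W = self.add_weight(name="W", shape=(d, d), initializer="glorot_uniform")
        self.b = self.add_weight(name="b", shape=(d,), initializer="zeros")
        self.u = self.add_weight(name="u", shape=(d, 1), initializer="glorot_uniform")

    def call(self, h):                                       # h: (batch, chunks, d)
        s = tf.tanh(tf.matmul(h, self.W) + self.b)           # per-chunk hidden representation
        alpha = tf.nn.softmax(tf.matmul(s, self.u), axis=1)  # (batch, chunks, 1) chunk weights
        return tf.reduce_sum(alpha * h, axis=1)              # attention-weighted sum of chunks

# Plugged into a BIGRU-style model: keep return_sequences=True on the top GRU
# and pool its per-chunk outputs with attention instead of taking the last state.
text_input = layers.Input(shape=(None, 768), dtype="float32")
encoded = layers.Bidirectional(layers.GRU(100, return_sequences=True))(text_input)
pooled = ChunkAttention()(encoded)
out = layers.Dense(1, activation="sigmoid")(layers.Dense(30, activation="relu")(pooled))
model = tf.keras.Model(text_input, out)
```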
18 | 19 | ### HAN 20 | 21 | We use both the *ILDCmulti* and *ILDCsingle* datasets. 22 | 23 | **Input**: We use GloVe embeddings (Embedding Dim: 180) to train the model. We take the last 40 sentences (at most), with 50 tokens (at most) per sentence, and tokenize the data with the NLTK tokenizer. 24 | GloVe model: [link](https://drive.google.com/drive/folders/1vNcOo4e8-qEhWQosJCND6g6_lTsZdOua) 25 | -------------------------------------------------------------------------------- /Models/transformers/concatenated/README.md: -------------------------------------------------------------------------------- 1 | ### For "Hierarchical Concatenated model trained on ILDCsingle" ### 2 | 3 | We used the XLNet model trained on the ILDCsingle train set. For more information about the fine-tuning of this transformer, see the readme file at CJPE/transformers/trained_on_single. 4 | 5 | Using the fine-tuned model, we extracted the [CLS] embeddings of the last 4 hidden layers and concatenated them into a single 768 x 4 = 3072-dimensional embedding. 6 | The file **concat_XLNet_embeddings_maker.ipynb** demonstrates this: it loads the fine-tuned model and saves the concatenated embeddings of the last 4 hidden states. 7 | For ease of reproducibility, we have saved the numpy files on Google Drive [here](https://drive.google.com/drive/folders/1tGDa8Jm4r3C5Cs_BuAvn-S3dH66e4cP7?usp=sharing). 8 | 9 | **Note:** The train set has been split into 3 files of about 1 GB each due to its size. 10 | 11 | You can use the **XLNet_full_concat_results.ipynb** file to reproduce the results. You will need the ILDC dataset files and the saved numpy files given above. 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Models/transformers/concatenated/concat_XLNet_embeddings_maker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |   "cells": [ 3 |     { 4 |       "cell_type": "code", 5 |       "execution_count": null, 6 |       "metadata": { 7 |         "colab": { 8 |           "base_uri": "https://localhost:8080/", 9 |           "height": 34 10 |         }, 11 |         "colab_type": "code", 12 |         "id": "mG41G9dM8z6L", 13 |         "outputId": "d161f77b-5819-4501-aba3-86447c2888b4" 14 |       }, 15 |       "outputs": [], 16 |       "source": [ 17 |         "!pip install transformers\n", 18 |         "import os\n", 19 |         "import random\n", 20 |         "import pandas as pd\n", 21 |         "import numpy as np\n", 22 |         "import csv\n", 23 |         "import tensorflow as tf\n", 24 |         "import torch\n", 25 |         "from sklearn.model_selection import train_test_split\n", 26 |         "from google.colab import drive\n", 27 |         "import textwrap\n", 28 |         "import progressbar\n", 29 |         "import keras\n", 30 |         "from keras.preprocessing.sequence import pad_sequences\n", 31 |         "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", 32 |         "from transformers import BertForSequenceClassification, AdamW, BertConfig\n", 33 |         "from transformers import get_linear_schedule_with_warmup\n", 34 |         "import time\n", 35 |         "import datetime\n", 36 |         "import json" 37 |       ] 38 |     }, 39 |     { 40 |       "cell_type": "code", 41 |       "execution_count": null, 42 |       "metadata": { 43 |         "colab": {}, 44 |         "colab_type": "code", 45 |         "id": "gOQ2YDd-87WS" 46 |       }, 47 |       "outputs": [], 48 |       "source": [ 49 |         "drive.mount(\"/content/Drive/\")" 50 |       ] 51 |     }, 52 |     { 53 |       "cell_type": "code", 54 |       "execution_count": null, 55 |       "metadata": { 56 |         "colab": {}, 57 |         "colab_type": "code", 58 |         "id": "yyEd_uKF8-Dj" 59 |       }, 60 |       "outputs": [], 61 |       "source": [ 62 | 
"df = pd.read_csv('/content/Drive/My Drive/LNLP/dataset.csv') # path to multi dataset\n", 63 | "train_set = df.query(\" split=='train' \")\n", 64 | "test_set = df.query(\" split=='test' \")\n", 65 | "validation_set = df.query(\" split=='dev' \")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "colab": {}, 73 | "colab_type": "code", 74 | "id": "jEieUV-l9MIj" 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig\n", 79 | "from transformers import BertForSequenceClassification, BertTokenizer, BertConfig\n", 80 | "from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig\n", 81 | "from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig\n", 82 | "from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig\n", 83 | "from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig\n", 84 | "\n", 85 | "MODEL_CLASSES = {\n", 86 | " 'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),\n", 87 | " 'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),\n", 88 | " 'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),\n", 89 | " 'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),\n", 90 | " 'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)}\n", 91 | "\n", 92 | "model_type = 'xlnet' ###--> CHANGE WHAT MODEL YOU WANT HERE!!! <--###\n", 93 | "model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]\n", 94 | "model_name = 'xlnet-base-cased'" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "colab": { 102 | "base_uri": "https://localhost:8080/", 103 | "height": 1000 104 | }, 105 | "colab_type": "code", 106 | "id": "gHCyZeBm9OnR", 107 | "outputId": "09e8b5cd-a64d-4e6b-d28a-be2425c5e2e5" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 112 | "output_dir = \"/content/Drive/My Drive/mini_XLNet/\"\n", 113 | "tokenizer = XLNetTokenizer.from_pretrained(output_dir)\n", 114 | "model = XLNetForSequenceClassification.from_pretrained(output_dir, output_hidden_states=True)\n", 115 | "model.to(device)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "colab": {}, 123 | "colab_type": "code", 124 | "id": "2swrOWVF-MKu" 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "def att_masking(input_ids):\n", 129 | " attention_masks = []\n", 130 | " for sent in input_ids:\n", 131 | " att_mask = [int(token_id > 0) for token_id in sent]\n", 132 | " attention_masks.append(att_mask)\n", 133 | " return attention_masks\n", 134 | "\n", 135 | "def grouped_input_ids(all_toks):\n", 136 | " splitted_toks = []\n", 137 | " l=0\n", 138 | " r=510\n", 139 | " while(l 10000):\n", 177 | " toks = toks[len(toks)-10000:]\n", 178 | "\n", 179 | " splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)\n", 180 | "\n", 181 | " vecs = []\n", 182 | " for index,ii in enumerate(splitted_input_ids):\n", 183 | " vecs.append(get_output_for_one_vec(ii, splitted_att_masks[index]))\n", 184 | " \n", 185 | " one_doc = np.asarray(vecs)\n", 186 | " all_docs.append(one_doc)\n", 187 | " \n", 188 | "\n", 189 | " all_docs = np.asarray(all_docs)\n", 190 | " return all_docs" 191 | ] 192 | }, 193 | { 194 | 
"cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "colab": {}, 198 | "colab_type": "code", 199 | "id": "9mmuAemf_j5y" 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "vecs_dev = generate_np_files_for_emb(validation_set, tokenizer)\n", 204 | "np.save(\"/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_dev.npy\", vecs_dev)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "colab": {}, 212 | "colab_type": "code", 213 | "id": "Evhxb371_qwb" 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "train_set_1 = train_set.iloc[:10000]\n", 218 | "train_set_2 = train_set.iloc[10000:20000]\n", 219 | "train_set_3 = train_set.iloc[20000:]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "colab": { 227 | "base_uri": "https://localhost:8080/", 228 | "height": 34 229 | }, 230 | "colab_type": "code", 231 | "id": "GgEPrgz9B7Pr", 232 | "outputId": "2281d57f-9af1-4070-9f29-f54ac5bc8467" 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "vecs_test = generate_np_files_for_emb(test_set, tokenizer)\n", 237 | "np.save(\"/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_test.npy\", vecs_test)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "colab": { 245 | "base_uri": "https://localhost:8080/", 246 | "height": 34 247 | }, 248 | "colab_type": "code", 249 | "id": "tTDe7bmlB-8O", 250 | "outputId": "aba8532b-0846-43a5-8662-76768e965dcf" 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "vecs_train_1 = generate_np_files_for_emb(train_set_1, tokenizer)\n", 255 | "np.save(\"/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_train_1.npy\", vecs_train_1)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "colab": { 263 | "base_uri": "https://localhost:8080/", 264 | "height": 34 265 | }, 266 | "colab_type": "code", 267 | "id": "7ep0_HMqUCXw", 268 | "outputId": "aab9d0aa-ead6-4806-a463-f8628caa97ee" 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "vecs_train_2 = generate_np_files_for_emb(train_set_2, tokenizer)\n", 273 | "np.save(\"/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_train_2.npy\", vecs_train_2)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "colab": { 281 | "base_uri": "https://localhost:8080/", 282 | "height": 34 283 | }, 284 | "colab_type": "code", 285 | "id": "PQDfbm5LUCi8", 286 | "outputId": "d2eebab5-c446-4d0c-a0a3-7a8dde125ec1" 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "vecs_train_3 = generate_np_files_for_emb(train_set_3, tokenizer)\n", 291 | "np.save(\"/content/Drive/My Drive/LNLP/Hierarchical/XLNet_full_concat/XLNet_train_3.npy\", vecs_train_3)" 292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "accelerator": "GPU", 297 | "colab": { 298 | "name": "concat_XLNet_embeddings_maker.ipynb", 299 | "provenance": [] 300 | }, 301 | "kernelspec": { 302 | "display_name": "Python 3", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.7.3" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 1 
321 | } 322 | -------------------------------------------------------------------------------- /Models/transformers/trained_on_multi/README.md: -------------------------------------------------------------------------------- 1 | ### Fine-tuned transformers on ILDCmulti 2 | 3 | We used the SequenceClassification models' codebase from Chris McCormick's tutorials [here](https://mccormickml.com/). 4 | 5 | In the paper we mentioned that for BERT we tried out multiple combinations of input tokens. The notebook provided here for BERT has only the last-512-tokens implementation. We encourage the reader to change the code (only a slight change is needed in the function input_id_maker), as it is really straightforward. The other transformer models were fine-tuned using the last 512 tokens only. 6 | 7 | You can find our fine-tuned models here (all saved models were trained on the last 512 tokens only): 8 | 9 | Model | Performance (Accuracy) | link | 10 | -----------|------------------------|------| 11 | DistilBERT | 64.21% | [here](https://drive.google.com/drive/folders/1-bdRjxo0l6rItR5jsGWnYE0p7iZanB_E) | 12 | BERT | 67.24% | [here](https://drive.google.com/drive/folders/17nddWo9e4Z-rljF83jIq1aEb3w71DouZ?usp=sharing)| 13 | RoBERTa | 71.26% | [here](https://drive.google.com/drive/folders/10CuJ9p2MNcRAwM87bNXisS07_smi97qX?usp=sharing) | 14 | XLNet | 70.01% | [here](https://drive.google.com/drive/folders/1Pn3uaPz6PNFaBo9IrtuJpC2PNpDMDXhS?usp=sharing) | 15 | 16 | **Note1**: These transformers are also used in the Voting Ensemble method. 17 | 18 | **Note2**: If you want to know more about the training hyperparameters we used, please look at section A of the appendix in the paper. 19 | 20 | ### For Hierarchical models using ILDCmulti 21 | 22 | For easy reproducibility, we also make available the [CLS] token embeddings. You can easily reproduce the hierarchical model results (transformer trained on ILDCmulti) using these embeddings and training a BiGRU model on them, as in the Sequential_Models directory. We have made available the code for Bidirectional GRU models both with and without attention. 23 | 24 | You can access the saved [CLS] embeddings here (except DistilBERT): 25 | 26 | Model | link | 27 | -----------|-------| 28 | BERT | [here](https://drive.google.com/drive/folders/1bYrsn8JEu_C2ExWBQnqUPmApCn-is8IL?usp=sharing)| 29 | RoBERTa | [here](https://drive.google.com/drive/folders/1b_9j1XoXpdvGEOnwahvIcSDz5SzVLvJ9?usp=sharing) | 30 | XLNet | [here](https://drive.google.com/drive/folders/1TXQctks4_8mPuBUrHC7mvDxhjs4Lxab2?usp=sharing) | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /Models/transformers/trained_on_single/README.md: -------------------------------------------------------------------------------- 1 | ### Fine-tuned transformers on ILDCsingle 2 | 3 | We used the SequenceClassification models' codebase from Chris McCormick's tutorials [here](https://mccormickml.com/). 4 | 5 | The fine-tuning of these transformers was done on ILDCsingle in the following steps (a minimal sketch of the chunking follows after this list): 6 | * Load the ILDCsingle data. 7 | * For each document, divide it into chunks of 512 tokens each, with an overlap of 100 tokens with neighbouring chunks. 8 | * Give each chunk the same label as the whole document. 9 | * Now fine-tune the transformer model taking each chunk as a separate input in training.
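As a rough illustration of the chunking step, here is a minimal sketch (not the exact notebook code: the function name `chunk_document` is ours, special tokens such as [CLS]/[SEP] are omitted for brevity, and `tokenizer` is assumed to be any loaded Hugging Face tokenizer):

```python
def chunk_document(text, label, tokenizer, chunk_len=512, overlap=100):
    """Split one document into token chunks of `chunk_len` with `overlap`
    tokens shared between neighbours; every chunk inherits the document label."""
    tokens = tokenizer.tokenize(text)
    stride = chunk_len - overlap  # advance by 412 fresh tokens per chunk
    chunks, labels = [], []
    start = 0
    while start < len(tokens):
        chunk = tokens[start:start + chunk_len]
        chunks.append(tokenizer.convert_tokens_to_ids(chunk))
        labels.append(label)  # chunk label = whole-document label
        if start + chunk_len >= len(tokens):
            break  # last (possibly shorter) chunk reached
        start += stride
    return chunks, labels
```

Each (chunk, label) pair is then treated as an independent training example for the SequenceClassification model. The same 512/100 chunking scheme is reused when extracting per-chunk [CLS] embeddings for the hierarchical models and in the voting ensemble, where per-chunk predictions are aggregated by majority vote.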
10 | 11 | You can find our fine-tuned models here (all saved models were trained using the above strategy): 12 | 13 | Model | link | 14 | -----------|-------| 15 | BERT | [here](https://drive.google.com/drive/folders/1TD2FQix8_gIOXiV2rTgbCU7lfrplH1HM?usp=sharing)| 16 | RoBERTa | [here](https://drive.google.com/drive/folders/1-u66E41nlGowY4RFAVCBR9iMkDL8zB80?usp=sharing) | 17 | XLNet | [here](https://drive.google.com/drive/folders/1d8p1StTObDzQoc_X2hEs7nGITfxa6L9I?usp=sharing) | 18 | 19 | **Note**: If you want to know more about the training hyperparameters we used, please look at section A of the appendix in the paper. 20 | 21 | ### For Hierarchical models using ILDCsingle 22 | 23 | Please remember that we fine-tuned the transformer on ILDCsingle using the above strategy. However, for the training of BiGRU or other models on [CLS] embedding features, we used the fine-tuned model to *get the [CLS] embeddings of each chunk of every document in ILDCmulti*. 24 | 25 | For easy reproducibility, we also make available the [CLS] token embeddings. You can easily reproduce the hierarchical model results (transformer trained on ILDCsingle) using these embeddings and training a BiGRU model on them, as in the Sequential_Models directory. We have made available the code for Bidirectional GRU models both with and without attention. 26 | 27 | You can access the saved [CLS] embeddings here: 28 | 29 | Model | link | 30 | -----------|-------| 31 | BERT | [here](https://drive.google.com/drive/folders/1g4di6WHAnKPoUl8gQ8YcQUOILBmmy1q7?usp=sharing)| 32 | RoBERTa | [here](https://drive.google.com/drive/folders/1NF5AGgztp29comv4HMABcRwowP4Kf0v5?usp=sharing) | 33 | XLNet | [here](https://drive.google.com/drive/folders/1aYH_dbe7YIiqZ6ULR_OVvXBlbKa_PyA5?usp=sharing) | 34 | -------------------------------------------------------------------------------- /Models/transformers/voting ensemble/README.md: -------------------------------------------------------------------------------- 1 | ## Voting Ensemble ## 2 | 3 | We used only RoBERTa and XLNet for the voting ensemble, as they gave the best results among the individual transformers. Steps for getting the results with the Voting Ensemble method: 4 | * Load the fine-tuned model (we used the transformers trained on ILDCmulti with the last 512 tokens only). 5 | * Divide each document into chunks of 512 tokens each, with an overlap of 100 tokens with the neighbouring chunks. 6 | * Use the fine-tuned model to get the label of each chunk. 7 | * The label for the whole document is determined by a majority vote over its chunk labels. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CJPE (Court Judgment Prediction and Explanation) 2 | Court Judgment Prediction and Explanation (Paper: ) 3 | 4 | The repository contains the full codebase of experiments and results of the ACL-IJCNLP 2021 paper "ILDC for CJPE: Indian Legal Documents Corpus for Court Judgment Prediction and Explanation". 5 | 6 | You can get the ILDC dataset in the Data folder. 7 | 8 | Our contributions can be summarized as follows: 9 | * We introduce a new task – Court Judgment Prediction and Explanation (CJPE) – with two sub-tasks: Court Judgment Prediction, and Explanation of the Prediction. 10 | * We create a new corpus, the Indian Legal Documents Corpus (ILDC), annotated with court decisions and corresponding explanations.
11 | * We develop a battery of methods to solve the task. We perform extensive experimentation with state-of-the-art machine learning algorithms for the judgment prediction task. We develop a new method for explaining machine predictions, since none of the existing methods could be readily applied in our setting. We evaluate our explainability results against annotations by legal experts, showing significant differences between the points of view of algorithms and experts. We also perform a detailed case study to understand the annotations by lawyers (for more details, please check the paper). 12 | 13 | ## License 14 | 15 | [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc/4.0/) 16 | 17 | 18 | The ILDC dataset and CJPE software follow the [CC-BY-NC](CC-BY-NC) license. Thus, users can share and adapt our dataset if they give credit to us and do not use our dataset for any commercial purposes. 19 | 20 | ## Citation 21 | 22 | ``` 23 | @inproceedings{malik-etal-2021-ildc, 24 | title = "{ILDC} for {CJPE}: {I}ndian Legal Documents Corpus for Court Judgment Prediction and Explanation", 25 | author = "Malik, Vijit and 26 | Sanjay, Rishabh and 27 | Nigam, Shubham Kumar and 28 | Ghosh, Kripabandhu and 29 | Guha, Shouvik Kumar and 30 | Bhattacharya, Arnab and 31 | Modi, Ashutosh", 32 | booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", 33 | month = aug, 34 | year = "2021", 35 | address = "Online", 36 | publisher = "Association for Computational Linguistics", 37 | url = "https://aclanthology.org/2021.acl-long.313", 38 | doi = "10.18653/v1/2021.acl-long.313", 39 | pages = "4046--4062", 40 | abstract = "An automated system that could assist a judge in predicting the outcome of a case would help expedite the judicial process. For such a system to be practically useful, predictions by the system should be explainable. To promote research in developing such a system, we introduce ILDC (Indian Legal Documents Corpus). ILDC is a large corpus of 35k Indian Supreme Court cases annotated with original court decisions. A portion of the corpus (a separate test set) is annotated with gold standard explanations by legal experts. Based on ILDC, we propose the task of Court Judgment Prediction and Explanation (CJPE). The task requires an automated system to predict an explainable outcome of a case. We experiment with a battery of baseline models for case predictions and propose a hierarchical occlusion based model for explainability. Our best prediction model has an accuracy of 78{\%} versus 94{\%} for human legal experts, pointing towards the complexity of the prediction task.
The analysis of explanations by the proposed algorithm reveals a significant difference in the point of view of the algorithm and legal experts for explaining the judgments, pointing towards scope for future research.", 41 | } 42 | ``` 43 | 44 | ## Contact 45 | 46 | In case of any queries, please contact 47 | -------------------------------------------------------------------------------- /Results/Attention Score vs Averaged Chunk Size all 25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/Attention Score vs Averaged Chunk Size all 25.jpg -------------------------------------------------------------------------------- /Results/BERT visualization.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/BERT visualization.jpg -------------------------------------------------------------------------------- /Results/BERT_tsne_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/BERT_tsne_test.png -------------------------------------------------------------------------------- /Results/BLEU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/BLEU.png -------------------------------------------------------------------------------- /Results/Doc2Vec_tsne_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/Doc2Vec_tsne_test.png -------------------------------------------------------------------------------- /Results/JACCARD SIMILARITY.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/JACCARD SIMILARITY.png -------------------------------------------------------------------------------- /Results/METEOR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/METEOR.png -------------------------------------------------------------------------------- /Results/OVERLAP-MAX.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/OVERLAP-MAX.png -------------------------------------------------------------------------------- /Results/OVERLAP-MIN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/OVERLAP-MIN.png -------------------------------------------------------------------------------- /Results/Occlusion Score vs Averaged Chunk Size all 25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/Occlusion Score vs Averaged Chunk Size all 25.jpg 
-------------------------------------------------------------------------------- /Results/README.md: -------------------------------------------------------------------------------- 1 | The main results are in the paper; due to space constraints, we were not able to show additional results and experiments there. The results below extend the results in the paper. 2 | 3 | ## Token visualization heatmap using Integrated Gradients for the BERT model (document 1951_33.txt) 4 | 5 | 6 | 7 | ## Explanation agreement among the annotators 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ## TSNE visualization on the test set using BERT 24 | 25 | 26 | 27 | ## TSNE visualization on the test set using Doc2Vec 28 | 29 | 30 | 31 | ## Plot of Attention Score vs Averaged Chunk Size 32 | 33 | 34 | 35 | ## Plot of Occlusion Score vs Averaged Chunk Size 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /Results/ROUGE-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/ROUGE-1.png -------------------------------------------------------------------------------- /Results/ROUGE-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/ROUGE-2.png -------------------------------------------------------------------------------- /Results/stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exploration-Lab/CJPE/8649506fc3cf771e5b463bd3adaad60ce04b5186/Results/stats.png --------------------------------------------------------------------------------