├── .gitignore ├── README.md ├── abstractive ├── BART_based_approaches │ ├── BART_fineTune.ipynb │ ├── BART_utilities.py │ ├── Readme.md │ ├── generate_summaries_chunking_BART.ipynb │ ├── generate_summaries_chunking_BART_RR.ipynb │ └── generate_summaries_chunking_BERT_BART.ipynb ├── BertSum-Abs │ ├── PreSumm │ │ ├── LICENSE │ │ ├── README.md │ │ ├── bert_data │ │ │ └── .gitignore │ │ ├── json_data │ │ │ └── cnndm_sample.train.0.json │ │ ├── logs │ │ │ └── .gitignore │ │ ├── models │ │ │ └── .gitignore │ │ ├── raw_data │ │ │ ├── temp.raw_src │ │ │ ├── temp.raw_tgt │ │ │ └── temp_ext.raw_src │ │ ├── requirements.txt │ │ ├── results │ │ │ └── .gitignore │ │ ├── src │ │ │ ├── cal_rouge.py │ │ │ ├── distributed.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── adam.py │ │ │ │ ├── data_loader.py │ │ │ │ ├── decoder.py │ │ │ │ ├── encoder.py │ │ │ │ ├── loss.py │ │ │ │ ├── model_builder.py │ │ │ │ ├── neural.py │ │ │ │ ├── optimizers.py │ │ │ │ ├── predictor.py │ │ │ │ ├── reporter.py │ │ │ │ ├── reporter_ext.py │ │ │ │ ├── trainer.py │ │ │ │ └── trainer_ext.py │ │ │ ├── others │ │ │ │ ├── __init__.py │ │ │ │ ├── logging.py │ │ │ │ ├── pyrouge.py │ │ │ │ ├── tokenization.py │ │ │ │ └── utils.py │ │ │ ├── post_stats.py │ │ │ ├── prepro │ │ │ │ ├── __init__.py │ │ │ │ ├── data_builder.py │ │ │ │ ├── smart_common_words.txt │ │ │ │ └── utils.py │ │ │ ├── preprocess.py │ │ │ ├── train.py │ │ │ ├── train_abstractive.py │ │ │ ├── train_extractive.py │ │ │ └── translate │ │ │ │ ├── __init__.py │ │ │ │ ├── beam.py │ │ │ │ └── penalties.py │ │ └── urls │ │ │ ├── cnn_mapping_test.txt │ │ │ ├── cnn_mapping_train.txt │ │ │ ├── cnn_mapping_valid.txt │ │ │ ├── mapping_test.txt │ │ │ └── mapping_valid.txt │ ├── Readme.md │ └── chunker.ipynb ├── BigBird │ ├── Pretrained_Bigbird.ipynb │ └── Readme.md ├── Generate_fine_tuning_data │ ├── Readme.md │ ├── gen_fine_tuning_data_CLS.ipynb │ ├── gen_fine_tuning_data_MCS.ipynb │ ├── gen_fine_tuning_data_MCS_RR.ipynb │ └── gen_fine_tuning_data_SIF.ipynb ├── Legal-LED │ ├── Legal-LED_Finetune.ipynb │ ├── Legal-LED_generate_summary.ipynb │ └── Readme.md ├── Legal-Pegasus │ ├── Readme.md │ ├── pegasus-test.ipynb │ └── pegasus_finetune.ipynb ├── Pointer Generator │ ├── chunker.ipynb │ ├── combiner.ipynb │ └── readme.md ├── README.md └── utilities.py ├── dataset └── README.md └── extractive ├── CaseSummarizer ├── README.md ├── dictionary.txt ├── gitkeep ├── preprocess.py ├── sample folder │ ├── gitkeep │ └── sample input.txt ├── sample_processed │ ├── gitkeep │ └── sample input.txt ├── sample_shortsumm │ ├── gitkeep │ └── sample input.txt ├── sample_summary │ ├── gitkeep │ └── sample input.txt ├── summary.py └── summary_length.py ├── Gist ├── README.md ├── codes │ ├── generate_features.py │ ├── generate_summary.py │ ├── gitkeep │ ├── imp_words.py │ ├── open_words.py │ ├── pos_tags.py │ ├── quant_feats.py │ ├── sent_pos.py │ ├── train_classifier.py │ ├── word2vec.py │ └── word_emb.py ├── cue_phrases.txt ├── generate_features.py ├── gitkeep ├── infer.py ├── length.txt ├── postags.txt └── train.py ├── GraphicalModel ├── README.md ├── crf_alltrain.model ├── crf_test.py ├── crf_train.py ├── extract-categories.py ├── formulated_constants.py ├── get-n-grams.py ├── gitkeep ├── graphicalModel.py ├── k_mix_model_test.py ├── k_mix_output.txt └── results_crf.txt ├── LetSum ├── README.md ├── extract-categories.py ├── formulated_constants.py ├── get-n-grams.py ├── gitkeep ├── letsum.py ├── letsum_test.py ├── modified_letsum.py └── results_letsum.txt ├── MMR ├── MMR.py ├── README.md └── 
gitkeep ├── RBM ├── README.md ├── Summarizer.py ├── enhancedfeatureSum ├── entity2.py ├── extractiveDemo.py ├── featureSum ├── gitkeep ├── logistic_sgd.py ├── ner.py ├── numofSen.py ├── para_reader.py ├── parseForExtract.py ├── plot.py ├── plotLines.py ├── rbm.py ├── saveCre.py └── sentence_labeller.py ├── README.md ├── abs_to_ext ├── README.md ├── extractive_labels.py └── gitkeep ├── annotated ├── A.R._KRISHNAMURTHY-annotated.txt ├── AMAR_KANT_CHOUDHARY-annotated.txt ├── Andhra_sugars.json ├── Archit.json ├── BIHARI_CHOWDHARY-annotated.txt ├── BISHNU_CHAND_LAL_CHAUDHARY-annotated.txt ├── Bhayana.json ├── Chandrachud.json ├── Commissioner_delhi.json ├── GANPAT_GIRI-annotated.txt ├── GUNENDRA_PRASAD_SEN_GUPTA-annotated.txt ├── Gwalior.json ├── Hameedullah.json ├── JCShah.json ├── KRISHNA_ALIAS_RAJU-annotated.txt ├── KUMAR_SUDHENDU_NARAIN_DEB-annotated.txt ├── Kapur.json ├── Ramaswamy.json ├── SAKHKKAR_MILLS_MAZDOOR_SANGH-annotated.txt ├── Sabyasachi_Mukherjee.json ├── Subbarao.json ├── V_P_Sayed_Mohammed-Iteration_02-annotated.txt ├── Writ_petition.json ├── Write_petition_civil.json ├── gitkeep └── kalawati.json └── top_ngrams ├── gitkeep ├── top_bigrams.txt ├── top_quadrigrams.txt ├── top_trigrams.txt └── top_unigrams.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains implementations and datasets made available in the following papers : 4 | 5 | - Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation accepted at AACL-IJCNLP 2022. 6 | - A Comparative Study of Summarization Algorithms Applied to Legal Case Judgments accepted at ECIR 2019. 7 | 8 | We provide the code base of various Extractive and Abstractive summarization algorithms applied to legal case documents. We also provide three summarization datasets from the Indian and UK Supreme Court case documents. 9 | 10 | # Dataset 11 | 12 | We make available 3 legal document summarization datasets. Please refer to the *dataset* folder for more details : 13 | 14 | - IN-Abs : Indian Supreme Court case documents & their *abstractive* summaries, obtained from http://www.liiofindia.org/in/cases/cen/INSC/ 15 | - IN-Ext : Indian Supreme Court case documents & their *extractive* summaries, written by two law experts (A1, A2). 16 | - UK-Abs : United Kingdom (U.K.) 
Supreme Court case documents & their *abstractive* summaries, obtained from https://www.supremecourt.uk/decided-cases/ 17 | 18 | 19 | # Extractive Summarization methods 20 | 21 | We provide the implementations of the following methods in the *extractive* folder of the repository : 22 | 23 | - [Improving Legal Document Summarization Using Graphical Models](https://www.cse.iitm.ac.in/~ravi/papers/Saravanan_jurix_06.pdf), JURIX 2006 24 | - [LetSum, a Text Summarization system in Law field](http://rali.iro.umontreal.ca/rali/?q=en/node/673), JURIX 2004 25 | - [CaseSummarizer](http://www.aclweb.org/anthology/C16-2054), COLING 2016 26 | - [Automatic Summarization of Legal Decisions using Iterative Masking of Predictive Sentences](https://dl.acm.org/doi/10.1145/3322640.3326728), ICAIL 2019 27 | - [Extracting the Gist of Chinese Judgments of the Supreme Court](https://dl.acm.org/doi/10.1145/3322640.3326715), ICAIL 2019 28 | - [Extractive Summarization using Deep Learning](https://arxiv.org/pdf/1708.04439.pdf), arXiv preprint, 2017 29 | - We also provide a script to convert an abstractive gold-standard summary into an extractive one, for training supervised extractive summarization methods. 30 | 31 | 32 | # Abstractive Summarization methods 33 | 34 | - Scripts to generate fine-tuning data using the following techniques 35 | - CLS 36 | - SIF 37 | - MCS 38 | - MCS_RR 39 | - Scripts to fine-tune the following models 40 | - [BART](https://huggingface.co/facebook/bart-large) 41 | - [legal-Pegasus](https://huggingface.co/nsi319/legal-pegasus) 42 | - [Legal-LED](https://huggingface.co/nsi319/legal-led-base-16384) 43 | - Scripts to generate summaries using the following models/methods 44 | - BART (CLS, MCS, SIF) 45 | - BART_RR 46 | - BERT-BART 47 | - Legal-LED 48 | - Pegasus 49 | - Helper chunking and combiner scripts for the BertSum-Abs and Pointer Generator methods 50 | 51 | # Availability of Trained Models 52 | 53 | The following models trained on the IN-Abs & UK-Abs datasets are publicly available at https://zenodo.org/record/7234359#.Y2tShdJByC1 54 | 55 | - Extractive : SummaRuNNer, Gist 56 | 57 | - Abstractive : Legal-Pegasus, Legal-LED 58 | 59 | 63 | 64 | # Citation 65 | If you are using the implementations / datasets / trained models, please cite the following papers: 66 | ``` 67 | @inproceedings{shukla2022, 68 | title={Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation}, 69 | author={Shukla, Abhay and Bhattacharya, Paheli and Poddar, Soham and Mukherjee, Rajdeep and Ghosh, Kripabandhu and Goyal, Pawan and Ghosh, Saptarshi}, 70 | booktitle={The 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing}, 71 | year={2022} 72 | } 73 | 74 | @inproceedings{bhattacharya2019comparative, 75 | title={A comparative study of summarization algorithms applied to legal case judgments}, 76 | author={Bhattacharya, Paheli and Hiware, Kaustubh and Rajgaria, Subham and Pochhi, Nilay and Ghosh, Kripabandhu and Ghosh, Saptarshi}, 77 | booktitle={European Conference on Information Retrieval}, 78 | pages={413--428}, 79 | year={2019}, 80 | organization={Springer} 81 | } 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts 2 | 3 | BART_fineTune.ipynb - Script for fine-tuning BART model 4 | 5 | 
generate_summaries_chunking_BART.ipynb - Script to generate summaries using chunking based BART method 6 | 7 | generate_summaries_chunking_BART_RR.ipynb - Script to generate summaries using chunking based BART_RR method 8 | 9 | generate_summaries_chunking_BERT_BART.ipynb - Script to generate summaries using chunking based BERT_BART method 10 | 11 | # Requirements 12 | 13 | transformers 4.12.3 14 | 15 | pytorch 1.10 16 | 17 | pytorch lightning 1.5.1 18 | 19 | # Usage 20 | 21 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 22 | 2. Follow the instructions given in the notebooks 23 | 24 | Note that rhetorical role-based segmenting requires rhetorical role labeled documents. Each line in the document would contain a sentence, \t, and the label (sentence_1\tlabel). The path to the directory of labeled documents should be updated in the utility.py. 25 | 26 | Rhetorical role labeling - https://link.springer.com/article/10.1007/s10506-021-09304-5 27 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/generate_summaries_chunking_BART.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"cea86415","metadata":{"id":"cea86415"},"source":["### Script to generate summaries using chunking based BART method\n","\n","Assign the dataset and output_path variable according to requirements. \n"]},{"cell_type":"code","execution_count":null,"id":"b13ccd09","metadata":{"id":"b13ccd09"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"ad7a3c99","metadata":{"id":"ad7a3c99"},"outputs":[],"source":["import sys\n","from BART_utilities import *\n","sys.path.insert(0, '../')\n","from utilities import *\n","\n","import transformers\n","import pandas as pd\n","import numpy as np\n","import glob\n","import nltk\n","import torch\n","import math\n","import random\n","import re\n","import argparse\n","import os\n"]},{"cell_type":"code","execution_count":null,"id":"383b92c3","metadata":{"id":"383b92c3"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"ea21bb4b","metadata":{"id":"ea21bb4b"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig\n","\n","\n","tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)\n","\n","model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\")"]},{"cell_type":"markdown","id":"67b07aa2","metadata":{"id":"67b07aa2"},"source":["### For using fine tuned model \n","1. uncomment the 2nd line in the following cell\n","2. 
add the path to the fine tuned model"]},{"cell_type":"code","execution_count":null,"id":"98713de3","metadata":{"id":"98713de3"},"outputs":[],"source":["bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)\n","\n","# bart_model = LitModel.load_from_checkpoint(\"/home/pahelibhattacharya/HULK/Abhay/models/BART_large_IN_MCS.ckpt\",\n","# learning_rate = 2e-5, tokenizer = tokenizer, model = model).to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"39eae844","metadata":{"id":"39eae844"},"outputs":[],"source":["def generate_summary_gpu(nested_sentences,p=0.2):\n"," '''\n"," Function to generate summaries from the list containing chunks of the document\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," summaries = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n"," input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')\n"," input_tokenized = input_tokenized.to(device)\n"," summary_ids = bart_model.model.to(device).generate(input_tokenized,\n"," length_penalty=0.01,\n"," min_length=l-5,\n"," max_length=l+5)\n"," output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]\n"," summaries.append(output)\n"," summaries = [sentence for sublist in summaries for sentence in sublist]\n"," return summaries"]},{"cell_type":"code","execution_count":null,"id":"f473093c","metadata":{"id":"f473093c"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"2bdf4344","metadata":{"id":"2bdf4344"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","for i in range(len(data_source)):\n"," name = names[i]\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len), end = \" ,\")\n"," \n"," nested = nest_sentences(doc,1024)\n"," l = int(req_len/len(nested))\n"," p = float(req_len/input_len)\n"," print(p)\n"," \n"," abs_summ = generate_summary_gpu(nested,p)\n","# print(abs_summ)\n","# break\n"," abs_summ = \" \".join(abs_summ)\n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n","# print(abs_summ)\n","# break\n"," print(len((abs_summ.split(\" \"))))\n"," \n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n"," \n"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"generate_summaries_chunking_BART.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/generate_summaries_chunking_BART_RR.ipynb: -------------------------------------------------------------------------------- 1 | 
{"cells":[{"cell_type":"markdown","id":"646853af","metadata":{"id":"646853af"},"source":["### Script to generate summaries using chunking based BART RR method\n","\n","Assign the dataset and output_path variable according to requirements. "]},{"cell_type":"code","execution_count":null,"id":"3fd5817d","metadata":{"id":"3fd5817d"},"outputs":[],"source":["dataset = \"N2\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"ad7a3c99","metadata":{"id":"ad7a3c99"},"outputs":[],"source":["import sys\n","from BART_utilities import *\n","sys.path.insert(0, '../')\n","import transformers\n","import pandas as pd\n","import numpy as np\n","import glob\n","import nltk\n","import torch\n","import math\n","import random\n","import re\n","import argparse\n","import os\n","from utilities import *"]},{"cell_type":"code","execution_count":null,"id":"383b92c3","metadata":{"id":"383b92c3"},"outputs":[],"source":["#Reading the test documents\n","names, data_source = get_summary_data_rhet_test(dataset)\n","print(len(names))\n","print(len(data_source))\n","dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"ea21bb4b","metadata":{"id":"ea21bb4b"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig\n","\n","\n","tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)\n","\n","model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\")\n","\n","new_tokens = ['', '', '', '', '

', '', '']\n","# tokenizer.add_special_tokens(new_tokens)\n","\n","special_tokens_dict = {'additional_special_tokens': new_tokens}\n","num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)\n","model.resize_token_embeddings(len(tokenizer))"]},{"cell_type":"markdown","id":"b3aa4218","metadata":{"id":"b3aa4218"},"source":["#### Add the path to fine tuned model"]},{"cell_type":"code","execution_count":null,"id":"865f3020","metadata":{"id":"865f3020"},"outputs":[],"source":["bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)\n","# bart_model = LitModel.load_from_checkpoint(\"path to model\",learning_rate = 2e-5, tokenizer = tokenizer, model = model).to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"76586532","metadata":{"id":"76586532"},"outputs":[],"source":["def nest_sentencesV3(doc, chunk_length):\n"," '''\n"," function to first segment the document using rhetorical roles and then chunk if required\n"," input: doc_sents - Input document sentence\n"," chunk_length - chunk length\n"," output: list of chunks\n"," '''\n"," doc_sents, _, dict_sents_labels = get_doc_sens_and_labels(doc)\n"," s = list(set(dict_sents_labels.values()))\n","# print(s)\n"," all_chunks = []\n"," \n"," for label in s:\n"," doc_sents_withlabels = []\n"," for sent in doc_sents:\n"," if sent == '':continue\n"," if dict_sents_labels[sent] == label:\n"," doc_sents_withlabels.append(sent)\n"," chunks = nest_sentencesMV2(doc_sents_withlabels, chunk_length)\n"," \n"," edited_chunks = []\n"," for chunk in chunks:\n"," edited_chunks.append([\"<\" + label + \">\"] + chunk)\n"," #modified\n"," \n"," all_chunks = all_chunks + ['. '.join(i) for i in edited_chunks]\n","\n"," return all_chunks \n"]},{"cell_type":"code","execution_count":null,"id":"39eae844","metadata":{"id":"39eae844"},"outputs":[],"source":["def generate_summary_gpu(nested_sentences,p):\n"," '''\n"," Function to generate summaries from the list containing chunks of the document\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," summaries = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n"," input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')\n"," input_tokenized = input_tokenized.to(device)\n"," summary_ids = bart_model.model.to(device).generate(input_tokenized,\n"," length_penalty=0.05,\n"," min_length=l-5,\n"," max_length=l+5)\n","\n"," output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]\n"," summaries.append(output)\n"," summaries = [sentence for sublist in summaries for sentence in sublist]\n"," return summaries"]},{"cell_type":"code","execution_count":null,"id":"85033199","metadata":{"id":"85033199"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"2bdf4344","metadata":{"id":"2bdf4344"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","output = []\n","for i in range(len(data_source)):\n"," name = names[i]\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len), end = \", \")\n"," \n"," nested = nest_sentencesV3(doc,1024)\n"," p = float(req_len/input_len)\n"," 
print(p)\n"," abs_summ = generate_summary_gpu(nested,p)\n"," abs_summ = \" \".join(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," \n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n","# print(abs_summ)\n","# break\n"," print(len((abs_summ.split(\" \"))))\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n"," \n","print(output)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"generate_summaries_chunking_BART_RR.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/generate_summaries_chunking_BERT_BART.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"8793d0c1","metadata":{"id":"8793d0c1"},"source":["### Script to generate summaries using chunking based BERT_BART method\n","\n","Assign the dataset and ouput_folder variable according to requirements. \n"]},{"cell_type":"code","execution_count":null,"id":"b94e8ba9","metadata":{"id":"b94e8ba9"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./IN_BERT_BART/\""]},{"cell_type":"code","execution_count":null,"id":"ad7a3c99","metadata":{"id":"ad7a3c99"},"outputs":[],"source":["import sys\n","from BART_utilities import *\n","sys.path.insert(0, '../')\n","import transformers\n","import pandas as pd\n","import numpy as np\n","import glob\n","import nltk\n","import torch\n","from utilities import *\n","import math\n","import random\n","import re\n","import argparse\n","import os"]},{"cell_type":"code","execution_count":null,"id":"383b92c3","metadata":{"id":"383b92c3"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))"]},{"cell_type":"code","execution_count":null,"id":"41e0ecab","metadata":{"id":"41e0ecab"},"outputs":[],"source":["dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"ea21bb4b","metadata":{"id":"ea21bb4b"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig\n","\n","tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)\n","\n","model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\")"]},{"cell_type":"markdown","id":"b7902d06","metadata":{"id":"b7902d06"},"source":["### For using fine tuned model \n","1. uncomment the 2nd line\n","2. 
add the path to the fine tuned model"]},{"cell_type":"code","execution_count":null,"id":"37717f34","metadata":{"id":"37717f34"},"outputs":[],"source":["bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)\n","\n","# bart_model = LitModel.load_from_checkpoint(\"/home/pahelibhattacharya/HULK/Abhay/models/BART_large_IN_MCS.ckpt\",\n","# learning_rate = 2e-5, tokenizer = tokenizer, model = model).to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"39eae844","metadata":{"id":"39eae844"},"outputs":[],"source":["def generate_summary_gpu(nested_sentences,p=0.2):\n"," '''\n"," Function to generate summaries from the list containing chunks of the document\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," summaries = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n","# print(l)\n"," input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')\n"," input_tokenized = input_tokenized.to(device)\n"," summary_ids = bart_model.model.to('cuda').generate(input_tokenized,\n"," length_penalty=0.01,\n"," min_length=l-5,\n"," max_length=l+5)\n"," output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]\n"," summaries.append(output)\n"," summaries = [sentence for sublist in summaries for sentence in sublist]\n"," return summaries"]},{"cell_type":"code","execution_count":null,"id":"69571a9f","metadata":{"id":"69571a9f"},"outputs":[],"source":["from summarizer import Summarizer\n","model = Summarizer()"]},{"cell_type":"code","execution_count":null,"id":"93b0d593","metadata":{"id":"93b0d593"},"outputs":[],"source":["def summ_doc(doc, req_len):\n"," '''\n"," function to generate summaries for a document\n"," input: doc - document\n"," req_len - Required summary length\n"," output: summary generated using BERT_BART method\n"," '''\n"," doc_len = len(doc.split(\" \"))\n"," r = (5*1024)/doc_len\n"," if r < 1 and req_len < 5*1024:\n"," ext_result = model(doc, ratio=r)\n"," else:\n"," ext_result = doc\n"," nested = nest_sentences(ext_result,1024)\n"," p = float(req_len/len(ext_result.split(\" \")))\n"," abs_summ = generate_summary_gpu(nested,p)\n"," \n"," summ = \" \".join(abs_summ)\n"," return summ"]},{"cell_type":"code","execution_count":null,"id":"1841dc0c","metadata":{"id":"1841dc0c"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"2bdf4344","metadata":{"id":"2bdf4344"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","import pandas as pd\n","output = []\n","done = glob.glob(output_path)\n","done = [i[i.rfind(\"/\")+1:] for i in done]\n","print(done)\n","\n","for i in range(len(data_source)):\n"," name = names[i]\n"," if name in done:continue\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len))\n"," \n"," abs_summ = summ_doc(doc, req_len)\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n"," \n","print(output)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"generate_summaries_chunking_BERT_BART.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft 
store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Yang Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/bert_data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/logs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/raw_data/temp.raw_src: -------------------------------------------------------------------------------- 1 | this Terry Jones had a love of the absurd that contributed much to the anarchic humour of Monty Python's Flying Circus. His style of visual comedy, leavened with a touch of the surreal, inspired many comedians who followed him. It was on Python that he honed his directing skills, notably on Life of Brian and The Meaning of Life. A keen historian, he wrote a number of books and fronted TV documentaries on ancient and medieval history. Terence Graham Parry Jones was born in Colwyn Bay in north Wales on 1 February 1942. His grandparents ran the local amateur operatic society and staged Gilbert and Sullivan concerts on the town's pier each year His family moved to Surrey when he was four but he always felt nostalgic about his native land. "I couldn't bear it and for the longest time I wanted Wales back," he once said. 
"I still feel very Welsh and feel it's where I should be really." After leaving the Royal Grammar School in Guildford, where he captained the school, he went on to read English at St Edmund Hall, Oxford. However, as he put it, he "strayed into history", the subject in which he graduated. While at Oxford he wrote sketches for the Oxford Revue and performed alongside a fellow student, Michael Palin. 2 | (CNN) An Iranian chess referee says she is frightened to return home after she was criticized online for not wearing the appropriate headscarf during an international tournament. Currently the chief adjudicator at the Women's World Chess Championship held in Russia and China, Shohreh Bayat says she fears arrest after a photograph of her was taken during the event and was then circulated online in Iran. "They are very sensitive about the hijab when we are representing Iran in international events and even sometimes they send a person with the team to control our hijab," Bayat told CNN Sport in a phone interview Tuesday. The headscarf, or the hijab, has been a mandatory part of women's dress in Iran since the 1979 Islamic revolution but, in recent years, some women have mounted opposition and staged protests about headwear rules. Bayat said she had been wearing a headscarf at the tournament but that certain camera angles had made it look like she was not. "If I come back to Iran, I think there are a few possibilities. It is highly possible that they arrest me [...] or it is possible that they invalidate my passport," added Bayat. "I think they want to make an example of me." The photographs were taken at the first stage of the chess championship in Shanghai, China, but Bayat has since flown to Vladivostok, Russia, for the second leg between Ju Wenjun and Aleksandra Goryachkina. She was left "panicked and shocked" when she became aware of the reaction in Iran after checking her phone in the hotel room. The 32-year-old said she felt helpless as websites reportedly condemned her for what some described as protesting the country's compulsory law. Subsequently, Bayat has decided to no longer wear the headscarf. "I'm not wearing it anymore because what is the point? I was just tolerating it, I don't believe in the hijab," she added. "People must be free to choose to wear what they want, and I was only wearing the hijab because I live in Iran and I had to wear it. I had no other choice." Bayat says she sought help from the country's chess federation. She says the federation told her to post an apology on her social media channels. She agreed under the condition that the federation would guarantee her safety but she said they refused. "My husband is in Iran, my parents are in Iran, all my family members are in Iran. I don't have anyone else outside of Iran. I don't know what to say, this is a very hard situation," she said. CNN contacted the Iranian Chess Federation on Tuesday but has yet to receive a response. -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/raw_data/temp.raw_tgt: -------------------------------------------------------------------------------- 1 | this Terry Jones had a love of the absurd that contributed much to the anarchic humour of Monty Python's Flying Circus. 2 | An Iranian chess referee says she is frightened to return home after she was criticized online for not wearing the appropriate headscarf during an international tournament. 
-------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/raw_data/temp_ext.raw_src: -------------------------------------------------------------------------------- 1 | this Terry Jones had a love of the absurd that contributed much to the anarchic humour of Monty Python's Flying Circus. [CLS] [SEP] His style of visual comedy, leavened with a touch of the surreal, inspired many comedians who followed him. [CLS] [SEP] It was on Python that he honed his directing skills, notably on Life of Brian and The Meaning of Life. [CLS] [SEP] A keen historian, he wrote a number of books and fronted TV documentaries on ancient and medieval history. [CLS] [SEP] Terence Graham Parry Jones was born in Colwyn Bay in north Wales on 1 February 1942. [CLS] [SEP] His grandparents ran the local amateur operatic society and staged Gilbert and Sullivan concerts on the town's pier each year His family moved to Surrey when he was four but he always felt nostalgic about his native land. [CLS] [SEP] "I couldn't bear it and for the longest time I wanted Wales back," he once said. [CLS] [SEP] "I still feel very Welsh and feel it's where I should be really." After leaving the Royal Grammar School in Guildford, where he captained the school, he went on to read English at St Edmund Hall, Oxford. [CLS] [SEP] However, as he put it, he "strayed into history", the subject in which he graduated. [CLS] [SEP] While at Oxford he wrote sketches for the Oxford Revue and performed alongside a fellow student, Michael Palin. 2 | (CNN) An Iranian chess referee says she is frightened to return home after she was criticized online for not wearing the appropriate headscarf during an international tournament. [CLS] [SEP] Currently the chief adjudicator at the Women's World Chess Championship held in Russia and China, Shohreh Bayat says she fears arrest after a photograph of her was taken during the event and was then circulated online in Iran. [CLS] [SEP] "They are very sensitive about the hijab when we are representing Iran in international events and even sometimes they send a person with the team to control our hijab," Bayat told CNN Sport in a phone interview Tuesday. [CLS] [SEP] The headscarf, or the hijab, has been a mandatory part of women's dress in Iran since the 1979 Islamic revolution but, in recent years, some women have mounted opposition and staged protests about headwear rules. [CLS] [SEP] Bayat said she had been wearing a headscarf at the tournament but that certain camera angles had made it look like she was not. [CLS] [SEP] "If I come back to Iran, I think there are a few possibilities. [CLS] [SEP] It is highly possible that they arrest me [. [CLS] [SEP]. [CLS] [SEP]. [CLS] [SEP]] or it is possible that they invalidate my passport," added Bayat. [CLS] [SEP] "I think they want to make an example of me. [CLS] [SEP]" The photographs were taken at the first stage of the chess championship in Shanghai, China, but Bayat has since flown to Vladivostok, Russia, for the second leg between Ju Wenjun and Aleksandra Goryachkina. [CLS] [SEP] She was left "panicked and shocked" when she became aware of the reaction in Iran after checking her phone in the hotel room. [CLS] [SEP] The 32-year-old said she felt helpless as websites reportedly condemned her for what some described as protesting the country's compulsory law. [CLS] [SEP] Subsequently, Bayat has decided to no longer wear the headscarf. [CLS] [SEP] "I'm not wearing it anymore because what is the point? 
I was just tolerating it, I don't believe in the hijab," she added. [CLS] [SEP] "People must be free to choose to wear what they want, and I was only wearing the hijab because I live in Iran and I had to wear it. [CLS] [SEP] I had no other choice. [CLS] [SEP]" Bayat says she sought help from the country's chess federation. [CLS] [SEP] She says the federation told her to post an apology on her social media channels. [CLS] [SEP] She agreed under the condition that the federation would guarantee her safety but she said they refused. [CLS] [SEP] "My husband is in Iran, my parents are in Iran, all my family members are in Iran. [CLS] [SEP] I don't have anyone else outside of Iran. [CLS] [SEP] I don't know what to say, this is a very hard situation," she said. [CLS] [SEP] CNN contacted the Iranian Chess Federation on Tuesday but has yet to receive a response. -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/requirements.txt: -------------------------------------------------------------------------------- 1 | multiprocess==0.70.9 2 | numpy==1.17.2 3 | pyrouge==0.1.3 4 | pytorch-transformers==1.2.0 5 | tensorboardX==1.9 6 | torch==1.1.0 -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/results/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/cal_rouge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | # from multiprocess import Pool as Pool2 5 | from multiprocessing import Pool 6 | 7 | import shutil 8 | import sys 9 | import codecs 10 | 11 | # from onmt.utils.logging import init_logger, logger 12 | from others import pyrouge 13 | 14 | 15 | def process(data): 16 | candidates, references, pool_id = data 17 | cnt = len(candidates) 18 | current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) 19 | tmp_dir = "rouge-tmp-{}-{}".format(current_time,pool_id) 20 | if not os.path.isdir(tmp_dir): 21 | os.mkdir(tmp_dir) 22 | os.mkdir(tmp_dir + "/candidate") 23 | os.mkdir(tmp_dir + "/reference") 24 | try: 25 | 26 | for i in range(cnt): 27 | if len(references[i]) < 1: 28 | continue 29 | with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w", 30 | encoding="utf-8") as f: 31 | f.write(candidates[i]) 32 | with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w", 33 | encoding="utf-8") as f: 34 | f.write(references[i]) 35 | r = pyrouge.Rouge155() 36 | r.model_dir = tmp_dir + "/reference/" 37 | r.system_dir = tmp_dir + "/candidate/" 38 | r.model_filename_pattern = 'ref.#ID#.txt' 39 | r.system_filename_pattern = r'cand.(\d+).txt' 40 | rouge_results = r.convert_and_evaluate() 41 | print(rouge_results) 42 | results_dict = r.output_to_dict(rouge_results) 43 | finally: 44 | pass 45 | if os.path.isdir(tmp_dir): 46 | shutil.rmtree(tmp_dir) 47 | return results_dict 48 | 49 | 50 | 51 | 52 | def chunks(l, n): 53 | """Yield successive n-sized chunks from l.""" 54 | for i in range(0, len(l), n): 55 | yield l[i:i + n] 56 | 57 | def test_rouge(cand, ref,num_processes): 58 | """Calculate ROUGE scores of sequences passed as an iterator 59 | e.g. 
a list of str, an open file, StringIO or even sys.stdin 60 | """ 61 | candidates = [line.strip() for line in cand] 62 | references = [line.strip() for line in ref] 63 | 64 | print(len(candidates)) 65 | print(len(references)) 66 | assert len(candidates) == len(references) 67 | candidates_chunks = list(chunks(candidates, int(len(candidates)/num_processes))) 68 | references_chunks = list(chunks(references, int(len(references)/num_processes))) 69 | n_pool = len(candidates_chunks) 70 | arg_lst = [] 71 | for i in range(n_pool): 72 | arg_lst.append((candidates_chunks[i],references_chunks[i],i)) 73 | pool = Pool(n_pool) 74 | results = pool.map(process,arg_lst) 75 | final_results = {} 76 | for i,r in enumerate(results): 77 | for k in r: 78 | if(k not in final_results): 79 | final_results[k] = r[k]*len(candidates_chunks[i]) 80 | else: 81 | final_results[k] += r[k] * len(candidates_chunks[i]) 82 | for k in final_results: 83 | final_results[k] = final_results[k]/len(candidates) 84 | return final_results 85 | def rouge_results_to_str(results_dict): 86 | return ">> ROUGE-F(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\nROUGE-R(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\n".format( 87 | results_dict["rouge_1_f_score"] * 100, 88 | results_dict["rouge_2_f_score"] * 100, 89 | # results_dict["rouge_3_f_score"] * 100, 90 | results_dict["rouge_l_f_score"] * 100, 91 | results_dict["rouge_1_recall"] * 100, 92 | results_dict["rouge_2_recall"] * 100, 93 | # results_dict["rouge_3_f_score"] * 100, 94 | results_dict["rouge_l_recall"] * 100 95 | 96 | # ,results_dict["rouge_su*_f_score"] * 100 97 | ) 98 | 99 | 100 | if __name__ == "__main__": 101 | # init_logger('test_rouge.log') 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument('-c', type=str, default="candidate.txt", 104 | help='candidate file') 105 | parser.add_argument('-r', type=str, default="reference.txt", 106 | help='reference file') 107 | parser.add_argument('-p', type=int, default=1, 108 | help='number of processes') 109 | args = parser.parse_args() 110 | print(args.c) 111 | print(args.r) 112 | print(args.p) 113 | if args.c.upper() == "STDIN": 114 | candidates = sys.stdin 115 | else: 116 | candidates = codecs.open(args.c, encoding="utf-8") 117 | references = codecs.open(args.r, encoding="utf-8") 118 | 119 | results_dict = test_rouge(candidates, references,args.p) 120 | # return 0 121 | print(time.strftime('%H:%M:%S', time.localtime()) 122 | ) 123 | print(rouge_results_to_str(results_dict)) 124 | # logger.info(rouge_results_to_str(results_dict)) -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/distributed.py: -------------------------------------------------------------------------------- 1 | """ Pytorch Distributed utils 2 | This piece of code was heavily inspired by the equivalent of Fairseq-py 3 | https://github.com/pytorch/fairseq 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | 9 | import math 10 | import pickle 11 | 12 | import torch.distributed 13 | 14 | from others.logging import logger 15 | 16 | 17 | def is_master(gpu_ranks, device_id): 18 | return gpu_ranks[device_id] == 0 19 | 20 | 21 | def multi_init(device_id, world_size,gpu_ranks): 22 | print(gpu_ranks) 23 | dist_init_method = 'tcp://localhost:10000' 24 | dist_world_size = world_size 25 | torch.distributed.init_process_group( 26 | backend='nccl', init_method=dist_init_method, 27 | world_size=dist_world_size, rank=gpu_ranks[device_id]) 28 | gpu_rank = torch.distributed.get_rank() 29 | if not is_master(gpu_ranks, device_id): 30 
| # print('not master') 31 | logger.disabled = True 32 | 33 | return gpu_rank 34 | 35 | 36 | 37 | def all_reduce_and_rescale_tensors(tensors, rescale_denom, 38 | buffer_size=10485760): 39 | """All-reduce and rescale tensors in chunks of the specified size. 40 | 41 | Args: 42 | tensors: list of Tensors to all-reduce 43 | rescale_denom: denominator for rescaling summed Tensors 44 | buffer_size: all-reduce chunk size in bytes 45 | """ 46 | # buffer size in bytes, determine equiv. # of elements based on data type 47 | buffer_t = tensors[0].new( 48 | math.ceil(buffer_size / tensors[0].element_size())).zero_() 49 | buffer = [] 50 | 51 | def all_reduce_buffer(): 52 | # copy tensors into buffer_t 53 | offset = 0 54 | for t in buffer: 55 | numel = t.numel() 56 | buffer_t[offset:offset+numel].copy_(t.view(-1)) 57 | offset += numel 58 | 59 | # all-reduce and rescale 60 | torch.distributed.all_reduce(buffer_t[:offset]) 61 | buffer_t.div_(rescale_denom) 62 | 63 | # copy all-reduced buffer back into tensors 64 | offset = 0 65 | for t in buffer: 66 | numel = t.numel() 67 | t.view(-1).copy_(buffer_t[offset:offset+numel]) 68 | offset += numel 69 | 70 | filled = 0 71 | for t in tensors: 72 | sz = t.numel() * t.element_size() 73 | if sz > buffer_size: 74 | # tensor is bigger than buffer, all-reduce and rescale directly 75 | torch.distributed.all_reduce(t) 76 | t.div_(rescale_denom) 77 | elif filled + sz > buffer_size: 78 | # buffer is full, all-reduce and replace buffer with grad 79 | all_reduce_buffer() 80 | buffer = [t] 81 | filled = sz 82 | else: 83 | # add tensor to buffer 84 | buffer.append(t) 85 | filled += sz 86 | 87 | if len(buffer) > 0: 88 | all_reduce_buffer() 89 | 90 | 91 | def all_gather_list(data, max_size=4096): 92 | """Gathers arbitrary data from all nodes into a list.""" 93 | world_size = torch.distributed.get_world_size() 94 | if not hasattr(all_gather_list, '_in_buffer') or \ 95 | max_size != all_gather_list._in_buffer.size(): 96 | all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size) 97 | all_gather_list._out_buffers = [ 98 | torch.cuda.ByteTensor(max_size) 99 | for i in range(world_size) 100 | ] 101 | in_buffer = all_gather_list._in_buffer 102 | out_buffers = all_gather_list._out_buffers 103 | 104 | enc = pickle.dumps(data) 105 | enc_size = len(enc) 106 | if enc_size + 2 > max_size: 107 | raise ValueError( 108 | 'encoded data exceeds max_size: {}'.format(enc_size + 2)) 109 | assert max_size < 255*256 110 | in_buffer[0] = enc_size // 255 # this encoding works for max_size < 65k 111 | in_buffer[1] = enc_size % 255 112 | in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc)) 113 | 114 | torch.distributed.all_gather(out_buffers, in_buffer.cuda()) 115 | 116 | results = [] 117 | for i in range(world_size): 118 | out_buffer = out_buffers[i] 119 | size = (255 * out_buffer[0].item()) + out_buffer[1].item() 120 | 121 | bytes_list = bytes(out_buffer[2:size+2].tolist()) 122 | result = pickle.loads(bytes_list) 123 | results.append(result) 124 | return results 125 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/models/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/models/adam.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer 4 | 5 | 6 | class Adam(Optimizer): 7 | r"""Implements Adam algorithm. 8 | 9 | It has been proposed in `Adam: A Method for Stochastic Optimization`_. 10 | 11 | Arguments: 12 | params (iterable): iterable of parameters to optimize or dicts defining 13 | parameter groups 14 | lr (float, optional): learning rate (default: 1e-3) 15 | betas (Tuple[float, float], optional): coefficients used for computing 16 | running averages of gradient and its square (default: (0.9, 0.999)) 17 | eps (float, optional): term added to the denominator to improve 18 | numerical stability (default: 1e-8) 19 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 20 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 21 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 22 | (default: False) 23 | 24 | .. _Adam\: A Method for Stochastic Optimization: 25 | https://arxiv.org/abs/1412.6980 26 | .. _On the Convergence of Adam and Beyond: 27 | https://openreview.net/forum?id=ryQu7f-RZ 28 | """ 29 | 30 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 31 | weight_decay=0, amsgrad=False): 32 | if not 0.0 <= lr: 33 | raise ValueError("Invalid learning rate: {}".format(lr)) 34 | if not 0.0 <= eps: 35 | raise ValueError("Invalid epsilon value: {}".format(eps)) 36 | if not 0.0 <= betas[0] < 1.0: 37 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 38 | if not 0.0 <= betas[1] < 1.0: 39 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 40 | defaults = dict(lr=lr, betas=betas, eps=eps, 41 | weight_decay=weight_decay, amsgrad=amsgrad) 42 | super(Adam, self).__init__(params, defaults) 43 | 44 | def __setstate__(self, state): 45 | super(Adam, self).__setstate__(state) 46 | for group in self.param_groups: 47 | group.setdefault('amsgrad', False) 48 | 49 | def step(self, closure=None): 50 | """Performs a single optimization step. 51 | Arguments: 52 | closure (callable, optional): A closure that reevaluates the model 53 | and returns the loss. 54 | """ 55 | loss = None 56 | if closure is not None: 57 | loss = closure() 58 | 59 | 60 | for group in self.param_groups: 61 | for p in group['params']: 62 | if p.grad is None: 63 | continue 64 | grad = p.grad.data 65 | if grad.is_sparse: 66 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 67 | 68 | state = self.state[p] 69 | 70 | # State initialization 71 | if len(state) == 0: 72 | state['step'] = 0 73 | # Exponential moving average of gradient values 74 | state['next_m'] = torch.zeros_like(p.data) 75 | # Exponential moving average of squared gradient values 76 | state['next_v'] = torch.zeros_like(p.data) 77 | 78 | next_m, next_v = state['next_m'], state['next_v'] 79 | beta1, beta2 = group['betas'] 80 | 81 | # Decay the first and second moment running average coefficient 82 | # In-place operations to update the averages at the same time 83 | next_m.mul_(beta1).add_(1 - beta1, grad) 84 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 85 | update = next_m / (next_v.sqrt() + group['eps']) 86 | 87 | # Just adding the square of the weights to the loss function is *not* 88 | # the correct way of using L2 regularization/weight decay with Adam, 89 | # since that will interact with the m and v parameters in strange ways. 
90 | # 91 | # Instead we want to decay the weights in a manner that doesn't interact 92 | # with the m/v parameters. This is equivalent to adding the square 93 | # of the weights to the loss with plain (non-momentum) SGD. 94 | if group['weight_decay'] > 0.0: 95 | update += group['weight_decay'] * p.data 96 | 97 | lr_scheduled = group['lr'] 98 | 99 | update_with_lr = lr_scheduled * update 100 | p.data.add_(-update_with_lr) 101 | 102 | state['step'] += 1 103 | 104 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 105 | # No bias correction 106 | # bias_correction1 = 1 - beta1 ** state['step'] 107 | # bias_correction2 = 1 - beta2 ** state['step'] 108 | 109 | return loss -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/models/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from models.neural import MultiHeadedAttention, PositionwiseFeedForward 7 | 8 | 9 | class Classifier(nn.Module): 10 | def __init__(self, hidden_size): 11 | super(Classifier, self).__init__() 12 | self.linear1 = nn.Linear(hidden_size, 1) 13 | self.sigmoid = nn.Sigmoid() 14 | 15 | def forward(self, x, mask_cls): 16 | h = self.linear1(x).squeeze(-1) 17 | sent_scores = self.sigmoid(h) * mask_cls.float() 18 | return sent_scores 19 | 20 | 21 | class PositionalEncoding(nn.Module): 22 | 23 | def __init__(self, dropout, dim, max_len=5000): 24 | pe = torch.zeros(max_len, dim) 25 | position = torch.arange(0, max_len).unsqueeze(1) 26 | div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * 27 | -(math.log(10000.0) / dim))) 28 | pe[:, 0::2] = torch.sin(position.float() * div_term) 29 | pe[:, 1::2] = torch.cos(position.float() * div_term) 30 | pe = pe.unsqueeze(0) 31 | super(PositionalEncoding, self).__init__() 32 | self.register_buffer('pe', pe) 33 | self.dropout = nn.Dropout(p=dropout) 34 | self.dim = dim 35 | 36 | def forward(self, emb, step=None): 37 | emb = emb * math.sqrt(self.dim) 38 | if (step): 39 | emb = emb + self.pe[:, step][:, None, :] 40 | 41 | else: 42 | emb = emb + self.pe[:, :emb.size(1)] 43 | emb = self.dropout(emb) 44 | return emb 45 | 46 | def get_emb(self, emb): 47 | return self.pe[:, :emb.size(1)] 48 | 49 | 50 | class TransformerEncoderLayer(nn.Module): 51 | def __init__(self, d_model, heads, d_ff, dropout): 52 | super(TransformerEncoderLayer, self).__init__() 53 | 54 | self.self_attn = MultiHeadedAttention( 55 | heads, d_model, dropout=dropout) 56 | self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) 57 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 58 | self.dropout = nn.Dropout(dropout) 59 | 60 | def forward(self, iter, query, inputs, mask): 61 | if (iter != 0): 62 | input_norm = self.layer_norm(inputs) 63 | else: 64 | input_norm = inputs 65 | 66 | mask = mask.unsqueeze(1) 67 | context = self.self_attn(input_norm, input_norm, input_norm, 68 | mask=mask) 69 | out = self.dropout(context) + inputs 70 | return self.feed_forward(out) 71 | 72 | 73 | class ExtTransformerEncoder(nn.Module): 74 | def __init__(self, d_model, d_ff, heads, dropout, num_inter_layers=0): 75 | super(ExtTransformerEncoder, self).__init__() 76 | self.d_model = d_model 77 | self.num_inter_layers = num_inter_layers 78 | self.pos_emb = PositionalEncoding(dropout, d_model) 79 | self.transformer_inter = nn.ModuleList( 80 | [TransformerEncoderLayer(d_model, heads, d_ff, dropout) 81 | for _ in 
range(num_inter_layers)]) 82 | self.dropout = nn.Dropout(dropout) 83 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 84 | self.wo = nn.Linear(d_model, 1, bias=True) 85 | self.sigmoid = nn.Sigmoid() 86 | 87 | def forward(self, top_vecs, mask): 88 | """ See :obj:`EncoderBase.forward()`""" 89 | 90 | batch_size, n_sents = top_vecs.size(0), top_vecs.size(1) 91 | pos_emb = self.pos_emb.pe[:, :n_sents] 92 | x = top_vecs * mask[:, :, None].float() 93 | x = x + pos_emb 94 | 95 | for i in range(self.num_inter_layers): 96 | x = self.transformer_inter[i](i, x, x, 1 - mask) # all_sents * max_tokens * dim 97 | 98 | x = self.layer_norm(x) 99 | sent_scores = self.sigmoid(self.wo(x)) 100 | sent_scores = sent_scores.squeeze(-1) * mask.float() 101 | 102 | return sent_scores 103 | 104 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/others/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/others/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/others/logging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | 6 | logger = logging.getLogger() 7 | 8 | 9 | def init_logger(log_file=None, log_file_level=logging.NOTSET): 10 | log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s") 11 | logger = logging.getLogger() 12 | logger.setLevel(logging.INFO) 13 | 14 | console_handler = logging.StreamHandler() 15 | console_handler.setFormatter(log_format) 16 | logger.handlers = [console_handler] 17 | 18 | if log_file and log_file != '': 19 | file_handler = logging.FileHandler(log_file) 20 | file_handler.setLevel(log_file_level) 21 | file_handler.setFormatter(log_format) 22 | logger.addHandler(file_handler) 23 | 24 | return logger 25 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/others/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import time 5 | 6 | from others import pyrouge 7 | 8 | REMAP = {"-lrb-": "(", "-rrb-": ")", "-lcb-": "{", "-rcb-": "}", 9 | "-lsb-": "[", "-rsb-": "]", "``": '"', "''": '"'} 10 | 11 | 12 | def clean(x): 13 | return re.sub( 14 | r"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''", 15 | lambda m: REMAP.get(m.group()), x) 16 | 17 | 18 | def process(params): 19 | temp_dir, data = params 20 | candidates, references, pool_id = data 21 | cnt = len(candidates) 22 | current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) 23 | tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}-{}".format(current_time, pool_id)) 24 | if not os.path.isdir(tmp_dir): 25 | os.mkdir(tmp_dir) 26 | os.mkdir(tmp_dir + "/candidate") 27 | os.mkdir(tmp_dir + "/reference") 28 | try: 29 | 30 | for i in range(cnt): 31 | if len(references[i]) < 1: 32 | continue 33 | with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w", 34 | encoding="utf-8") as f: 35 | f.write(candidates[i]) 36 | with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w", 37 | encoding="utf-8") as f: 38 | f.write(references[i]) 39 | r = pyrouge.Rouge155(temp_dir=temp_dir) 40 | r.model_dir = tmp_dir + "/reference/" 41 | 
r.system_dir = tmp_dir + "/candidate/" 42 | r.model_filename_pattern = 'ref.#ID#.txt' 43 | r.system_filename_pattern = r'cand.(\d+).txt' 44 | rouge_results = r.convert_and_evaluate() 45 | print(rouge_results) 46 | results_dict = r.output_to_dict(rouge_results) 47 | finally: 48 | pass 49 | if os.path.isdir(tmp_dir): 50 | shutil.rmtree(tmp_dir) 51 | return results_dict 52 | 53 | 54 | def test_rouge(temp_dir, cand, ref): 55 | candidates = [line.strip() for line in open(cand, encoding='utf-8')] 56 | references = [line.strip() for line in open(ref, encoding='utf-8')] 57 | print(len(candidates)) 58 | print(len(references)) 59 | assert len(candidates) == len(references) 60 | 61 | cnt = len(candidates) 62 | current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) 63 | tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time)) 64 | if not os.path.isdir(tmp_dir): 65 | os.mkdir(tmp_dir) 66 | os.mkdir(tmp_dir + "/candidate") 67 | os.mkdir(tmp_dir + "/reference") 68 | try: 69 | 70 | for i in range(cnt): 71 | if len(references[i]) < 1: 72 | continue 73 | with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w", 74 | encoding="utf-8") as f: 75 | f.write(candidates[i]) 76 | with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w", 77 | encoding="utf-8") as f: 78 | f.write(references[i]) 79 | r = pyrouge.Rouge155(temp_dir=temp_dir) 80 | r.model_dir = tmp_dir + "/reference/" 81 | r.system_dir = tmp_dir + "/candidate/" 82 | r.model_filename_pattern = 'ref.#ID#.txt' 83 | r.system_filename_pattern = r'cand.(\d+).txt' 84 | rouge_results = r.convert_and_evaluate() 85 | print(rouge_results) 86 | results_dict = r.output_to_dict(rouge_results) 87 | finally: 88 | pass 89 | if os.path.isdir(tmp_dir): 90 | shutil.rmtree(tmp_dir) 91 | return results_dict 92 | 93 | 94 | def tile(x, count, dim=0): 95 | """ 96 | Tiles x on dimension dim count times. 
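For example, tile(x, 3, dim=0) maps rows [a, b] to [a, a, a, b, b, b]; this is typically used to expand encoder states to beam_size copies during beam search.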
97 | """ 98 | perm = list(range(len(x.size()))) 99 | if dim != 0: 100 | perm[0], perm[dim] = perm[dim], perm[0] 101 | x = x.permute(perm).contiguous() 102 | out_size = list(x.size()) 103 | out_size[0] *= count 104 | batch = x.size(0) 105 | x = x.view(batch, -1) \ 106 | .transpose(0, 1) \ 107 | .repeat(count, 1) \ 108 | .transpose(0, 1) \ 109 | .contiguous() \ 110 | .view(*out_size) 111 | if dim != 0: 112 | x = x.permute(perm).contiguous() 113 | return x 114 | 115 | def rouge_results_to_str(results_dict): 116 | return ">> ROUGE-F(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\nROUGE-R(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\n".format( 117 | results_dict["rouge_1_f_score"] * 100, 118 | results_dict["rouge_2_f_score"] * 100, 119 | # results_dict["rouge_3_f_score"] * 100, 120 | results_dict["rouge_l_f_score"] * 100, 121 | results_dict["rouge_1_recall"] * 100, 122 | results_dict["rouge_2_recall"] * 100, 123 | # results_dict["rouge_3_f_score"] * 100, 124 | results_dict["rouge_l_recall"] * 100 125 | 126 | # ,results_dict["rouge_su*_f_score"] * 100 127 | ) 128 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/post_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from os import path 3 | from functools import reduce 4 | import re 5 | 6 | def str2bool(v): 7 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 8 | return True 9 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 10 | return False 11 | else: 12 | raise argparse.ArgumentTypeError('Boolean value expected.') 13 | 14 | 15 | 16 | def n_grams(tokens, n): 17 | l = len(tokens) 18 | return [tuple(tokens[i:i + n]) for i in range(l) if i + n < l] 19 | 20 | def has_repeat(elements): 21 | d = set(elements) 22 | return len(d) < len(elements) 23 | 24 | def cal_self_repeat(summary): 25 | ngram_repeats = {2: 0, 4: 0, 8: 0} 26 | sents = summary.split('') 27 | for n in ngram_repeats.keys(): 28 | # Respect sentence boundary 29 | grams = reduce(lambda x, y: x + y, [n_grams(sent.split(), n) for sent in sents], []) 30 | ngram_repeats[n] += has_repeat(grams) 31 | return ngram_repeats 32 | 33 | def cal_novel(summary, gold, source, summary_ngram_novel, gold_ngram_novel): 34 | summary = summary.replace('',' ') 35 | summary = re.sub(r' +', ' ', summary).strip() 36 | gold = gold.replace('',' ') 37 | gold = re.sub(r' +', ' ', gold).strip() 38 | source = source.replace(' ##','') 39 | source = source.replace('[CLS]',' ').replace('[SEP]',' ').replace('[PAD]',' ') 40 | source = re.sub(r' +', ' ', source).strip() 41 | 42 | 43 | for n in summary_ngram_novel.keys(): 44 | summary_grams = set(n_grams(summary.split(), n)) 45 | gold_grams = set(n_grams(gold.split(), n)) 46 | source_grams = set(n_grams(source.split(), n)) 47 | joint = summary_grams.intersection(source_grams) 48 | novel = summary_grams - joint 49 | summary_ngram_novel[n][0] += 1.0*len(novel) 50 | summary_ngram_novel[n][1] += len(summary_grams) 51 | summary_ngram_novel[n][2] += 1.0 * len(novel) / (len(summary.split()) + 1e-6) 52 | joint = gold_grams.intersection(source_grams) 53 | novel = gold_grams - joint 54 | gold_ngram_novel[n][0] += 1.0*len(novel) 55 | gold_ngram_novel[n][1] += len(gold_grams) 56 | gold_ngram_novel[n][2] += 1.0 * len(novel) / (len(gold.split()) + 1e-6) 57 | 58 | 59 | def cal_repeat(args): 60 | candidate_lines = open(args.result_path+'.candidate').read().strip().split('\n') 61 | gold_lines = open(args.result_path+'.gold').read().strip().split('\n') 62 | src_lines = 
open(args.result_path+'.raw_src').read().strip().split('\n') 63 | lines = zip(candidate_lines,gold_lines,src_lines) 64 | 65 | summary_ngram_novel = {1: [0, 0, 0], 2: [0, 0, 0], 4: [0, 0, 0]} 66 | gold_ngram_novel = {1: [0, 0, 0], 2: [0, 0, 0], 4: [0, 0, 0]} 67 | 68 | for c,g,s in lines: 69 | # self_repeats = cal_self_repeat(c) 70 | cal_novel(c, g, s,summary_ngram_novel, gold_ngram_novel) 71 | print(summary_ngram_novel, gold_ngram_novel) 72 | 73 | for n in summary_ngram_novel.keys(): 74 | # summary_ngram_novel[n] = summary_ngram_novel[n][2]/len(src_lines) 75 | # gold_ngram_novel[n] = gold_ngram_novel[n][2]/len(src_lines) 76 | summary_ngram_novel[n] = summary_ngram_novel[n][0]/summary_ngram_novel[n][1] 77 | gold_ngram_novel[n] = gold_ngram_novel[n][0]/gold_ngram_novel[n][1] 78 | print(summary_ngram_novel, gold_ngram_novel) 79 | 80 | 81 | if __name__ == '__main__': 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("-mode", default='', type=str) 84 | parser.add_argument("-result_path", default='../../results/cnndm.0') 85 | 86 | 87 | args = parser.parse_args() 88 | eval(args.mode + '(args)') 89 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/prepro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/prepro/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/prepro/utils.py: -------------------------------------------------------------------------------- 1 | # stopwords = pkgutil.get_data(__package__, 'smart_common_words.txt') 2 | # stopwords = stopwords.decode('ascii').split('\n') 3 | # stopwords = {key.strip(): 1 for key in stopwords} 4 | 5 | 6 | def _get_ngrams(n, text): 7 | """Calcualtes n-grams. 8 | 9 | Args: 10 | n: which n-grams to calculate 11 | text: An array of tokens 12 | 13 | Returns: 14 | A set of n-grams 15 | """ 16 | ngram_set = set() 17 | text_length = len(text) 18 | max_index_ngram_start = text_length - n 19 | for i in range(max_index_ngram_start + 1): 20 | ngram_set.add(tuple(text[i:i + n])) 21 | return ngram_set 22 | 23 | 24 | def _get_word_ngrams(n, sentences): 25 | """Calculates word n-grams for multiple sentences. 
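Note: sentences is a list of token lists; the tokens of all sentences are concatenated before the n-grams are extracted.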
26 | """ 27 | assert len(sentences) > 0 28 | assert n > 0 29 | 30 | # words = _split_into_words(sentences) 31 | 32 | words = sum(sentences, []) 33 | # words = [w for w in words if w not in stopwords] 34 | return _get_ngrams(n, words) 35 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/preprocess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | 4 | import argparse 5 | import time 6 | 7 | from others.logging import init_logger 8 | from prepro import data_builder 9 | 10 | 11 | def do_format_to_lines(args): 12 | print(time.clock()) 13 | data_builder.format_to_lines(args) 14 | print(time.clock()) 15 | 16 | def do_format_to_bert(args): 17 | print(time.clock()) 18 | data_builder.format_to_bert(args) 19 | print(time.clock()) 20 | 21 | 22 | 23 | def do_format_xsum_to_lines(args): 24 | print(time.clock()) 25 | data_builder.format_xsum_to_lines(args) 26 | print(time.clock()) 27 | 28 | def do_tokenize(args): 29 | print(time.clock()) 30 | data_builder.tokenize(args) 31 | print(time.clock()) 32 | 33 | 34 | def str2bool(v): 35 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 36 | return True 37 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 38 | return False 39 | else: 40 | raise argparse.ArgumentTypeError('Boolean value expected.') 41 | 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument("-pretrained_model", default='bert', type=str) 46 | 47 | parser.add_argument("-mode", default='', type=str) 48 | parser.add_argument("-select_mode", default='greedy', type=str) 49 | parser.add_argument("-map_path", default='../../data/') 50 | parser.add_argument("-raw_path", default='../../line_data') 51 | parser.add_argument("-save_path", default='../../data/') 52 | 53 | parser.add_argument("-shard_size", default=2000, type=int) 54 | parser.add_argument('-min_src_nsents', default=3, type=int) 55 | parser.add_argument('-max_src_nsents', default=100, type=int) 56 | parser.add_argument('-min_src_ntokens_per_sent', default=5, type=int) 57 | parser.add_argument('-max_src_ntokens_per_sent', default=200, type=int) 58 | parser.add_argument('-min_tgt_ntokens', default=5, type=int) 59 | parser.add_argument('-max_tgt_ntokens', default=500, type=int) 60 | 61 | parser.add_argument("-lower", type=str2bool, nargs='?',const=True,default=True) 62 | parser.add_argument("-use_bert_basic_tokenizer", type=str2bool, nargs='?',const=True,default=False) 63 | 64 | parser.add_argument('-log_file', default='../../logs/cnndm.log') 65 | 66 | parser.add_argument('-dataset', default='') 67 | 68 | parser.add_argument('-n_cpus', default=2, type=int) 69 | 70 | 71 | args = parser.parse_args() 72 | init_logger(args.log_file) 73 | eval('data_builder.'+args.mode + '(args)') 74 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/translate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/translate/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/translate/penalties.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class PenaltyBuilder(object): 6 | """ 7 
| Returns the Length and Coverage Penalty function for Beam Search. 8 | 9 | Args: 10 | length_pen (str): option name of length pen 11 | cov_pen (str): option name of cov pen 12 | """ 13 | 14 | def __init__(self, length_pen): 15 | self.length_pen = length_pen 16 | 17 | def length_penalty(self): 18 | if self.length_pen == "wu": 19 | return self.length_wu 20 | elif self.length_pen == "avg": 21 | return self.length_average 22 | else: 23 | return self.length_none 24 | 25 | """ 26 | Below are all the different penalty terms implemented so far 27 | """ 28 | 29 | 30 | def length_wu(self, beam, logprobs, alpha=0.): 31 | """ 32 | NMT length re-ranking score from 33 | "Google's Neural Machine Translation System" :cite:`wu2016google`. 34 | """ 35 | 36 | modifier = (((5 + len(beam.next_ys)) ** alpha) / 37 | ((5 + 1) ** alpha)) 38 | return (logprobs / modifier) 39 | 40 | def length_average(self, beam, logprobs, alpha=0.): 41 | """ 42 | Returns the average probability of tokens in a sequence. 43 | """ 44 | return logprobs / len(beam.next_ys) 45 | 46 | def length_none(self, beam, logprobs, alpha=0., beta=0.): 47 | """ 48 | Returns unmodified scores. 49 | """ 50 | return logprobs -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | PreSumm - Contains the dev branch of https://github.com/nlpyang/PreSumm 4 | 5 | Chunker.ipynb - Script to form chunks from text files 6 | 7 | # Requirements 8 | 9 | multiprocess 0.70.9 10 | numpy 1.17.2 11 | pyrouge 0.1.3 12 | pytorch-transformers 1.2.0 13 | tensorboardX 1.9 14 | torch 1.1.0 15 | 16 | # Usage 17 | 18 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 19 | 2. Use Chunker to chunk the input documents into chunk files 20 | 3. Use PreSumm to generate summaries for the chunk files -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/chunker.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"111743d0","metadata":{"id":"111743d0"},"source":["### Script to chunk and save every file. Every line in the output file is a chunk. 
"]},{"cell_type":"code","execution_count":null,"id":"6f05a1f8","metadata":{"id":"6f05a1f8"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"82ea4352","metadata":{"id":"82ea4352"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import os\n","import nltk\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")"]},{"cell_type":"code","execution_count":null,"id":"74b25143","metadata":{"id":"74b25143"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path + './'+ dataset +'_chunked/'):\n"," os.makedirs(output_path + './'+ dataset +'_chunked/')"]},{"cell_type":"code","execution_count":null,"id":"0238b231","metadata":{"id":"0238b231"},"outputs":[],"source":["for i in range(len(data_source)):\n"," doc = data_source[i].replace(\"\\n\", \" \")\n"," chunks = nest_sentences(doc, 512)\n"," save = \"\\n\".join(chunks)\n"," path = output_path + \"./\"+ dataset +\"_chunked/\" + names[i]\n"," file = open(path,'w')\n"," file.write(save)\n"," file.close()\n","# break"]}],"metadata":{"colab":{"name":"chunker.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BigBird/Pretrained_Bigbird.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"0e32b71c","metadata":{"id":"0e32b71c"},"source":["### Script to generate summaries using chunking based pre-trained BigBird model\n","\n","Assign the dataset and output_path variable according to requirements. 
\n"]},{"cell_type":"code","execution_count":null,"id":"177e102c","metadata":{"id":"177e102c"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./IN_BigBird/\""]},{"cell_type":"code","execution_count":null,"id":"80b38f7d","metadata":{"id":"80b38f7d"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","import os\n","import nltk\n","import torch\n","from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer"]},{"cell_type":"code","execution_count":null,"id":"2ed47d74","metadata":{"id":"2ed47d74"},"outputs":[],"source":["if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"b6fd18d6","metadata":{"id":"b6fd18d6"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","len_dic = dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"2f7b0310","metadata":{"id":"2f7b0310"},"outputs":[],"source":["DATASET_NAME = \"pubmed\"\n","DEVICE = \"cuda:1\"\n","CACHE_DIR = DATASET_NAME\n","MODEL_ID = f\"google/bigbird-pegasus-large-{DATASET_NAME}\"\n","\n","tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n","model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)\n"]},{"cell_type":"code","execution_count":null,"id":"9e8f7c79","metadata":{"id":"9e8f7c79"},"outputs":[],"source":["def get_summ(input, max_l, min_l):\n"," '''\n"," Function to generate summaries from the document. This function uses a chunking-based approach.\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," nested = nest_sentences(input, 4096)\n"," summs = []\n"," for chunk in nested:\n"," inputs_dict = tokenizer(chunk, padding=\"max_length\", max_length=4096, return_tensors=\"pt\", truncation=True)\n"," inputs_dict = {k: inputs_dict[k].to(DEVICE) for k in inputs_dict}\n"," predicted_abstract_ids = model.generate(**inputs_dict, min_length=min_l, num_beams=5)\n"," result = tokenizer.decode(predicted_abstract_ids[0], skip_special_tokens=True)\n"," summs.append(result)\n","# print(result)\n"," summ = '. 
'.join(summs)\n"," return summ\n"," "]},{"cell_type":"code","execution_count":null,"id":"36780f7e","metadata":{"id":"36780f7e"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","result = []\n","for i in range(len(data_source)):\n"," print(str(i) + \" : \" + names[i])\n"," summ = get_summ(data_source[i], len_dic[names[i]], len_dic[names[i]]-100)\n"," result.append(summ)\n"," path = output_path + names[i]\n"," file = open(path,'w')\n"," file.write(summ)\n"," file.close()\n","# break\n","print(result)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"Pretrained_Bigbird.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BigBird/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | Pretrained_Bigbird.ipynb - Script to generate summaries using chunking based pre-trained BigBird Model 4 | 5 | # Requirements 6 | 7 | transformers 4.12.3 8 | 9 | # Usage 10 | 11 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 12 | 2. Follow the instructions given in the notebooks -------------------------------------------------------------------------------- /abstractive/Generate_fine_tuning_data/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | gen_fine_tuning_data_CLS.ipynb - Script to generate fine tuning data using CLS similarity method 4 | 5 | gen_fine_tuning_data_MCS.ipynb - Script to generate fine tuning data using MCS similarity method 6 | 7 | gen_fine_tuning_data_SIF.ipynb - Script to generate fine-tuning data using SIF similarity method 8 | 9 | gen_fine_tuning_data_MCS_RR.ipynb - Script to generate fine-tuning data using MCS similarity method and rhetorical role-based segmenting 10 | 11 | 12 | # Requirements 13 | 14 | transformers 4.12.3 15 | 16 | pytorch 1.10 17 | 18 | pytorch lightning 1.5.1 19 | 20 | bert-extractive-summarizer 0.9.0 21 | 22 | # Usage 23 | 24 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 25 | 2. Add the UK_freq.json or IN_freq.json to the folder 26 | 3. Follow the instructions given in the notebooks 27 | 28 | Note that rhetorical role-based segmenting requires rhetorical role labeled documents. Each line in the document would contain a sentence, \t, and the label (sentence_1\tlabel). 
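For reference, here is a minimal sketch of how such a rhetorical-role labelled file could be read; the helper name and the label value shown are illustrative only, and the actual label set depends on the rhetorical role labeller used:

```python
# Minimal sketch (assumed layout): every line of a labelled document is
# "<sentence>\t<label>"; the label names are only placeholders.
def read_rr_labelled_doc(path):
    sentences, labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            sentence, _, label = line.rpartition("\t")
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# e.g. read_rr_labelled_doc("doc_1.txt") might return
# (["The appeal is dismissed with costs."], ["Ruling"])
```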
29 | 30 | Rhetorical role labeling - https://link.springer.com/article/10.1007/s10506-021-09304-5 -------------------------------------------------------------------------------- /abstractive/Generate_fine_tuning_data/gen_fine_tuning_data_CLS.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"06bdfbbf","metadata":{"id":"06bdfbbf"},"source":["### Script to generate fine tuning data using CLS similarity method\n","\n","Change the datasets variable according to the requirements"]},{"cell_type":"code","execution_count":null,"id":"6a8faa6c","metadata":{"id":"6a8faa6c"},"outputs":[],"source":["dataset = \"IN\" # Options: IN, UK "]},{"cell_type":"code","execution_count":null,"id":"605bcef6","metadata":{"id":"605bcef6"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import json\n","import os\n","import sys\n","from tqdm import tqdm\n","sys.path.insert(0, '../')\n","from utilities import *\n","from sklearn.metrics.pairwise import cosine_similarity\n","import torch"]},{"cell_type":"code","execution_count":null,"id":"802a0612","metadata":{"id":"802a0612"},"outputs":[],"source":["#Reading the documents and summaries \n","names, data_source, data_summary = get_summary_data(dataset, \"train\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))"]},{"cell_type":"code","execution_count":null,"id":"9b0c3bff","metadata":{"id":"9b0c3bff"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import AutoTokenizer\n","from transformers import BertModel\n","tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n","bert_model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = False).to(\"cuda\")\n","bert_model.eval()"]},{"cell_type":"code","execution_count":null,"id":"c14ab500","metadata":{"id":"c14ab500"},"outputs":[],"source":["def get_sen_encoding(sents):\n"," '''\n"," Function to generate encoding for each word in the input list \n"," input: sents - List of sentences\n"," returns the list of the sentence encoding \n"," '''\n"," a = 0.001\n"," answer = None\n"," for sent in sents:\n"," ip =tokenizer(sent, return_tensors='pt', max_length=150, truncation=True, padding='max_length')\n"," tokens = tokenizer.convert_ids_to_tokens(ip['input_ids'][0])\n"," ip = ip.to(\"cuda\")\n"," bert_output = bert_model(**ip)\n"," embedding = bert_output['pooler_output'].clone().detach()\n"," embedding = embedding.to(\"cpu\")\n"," if answer == None:\n"," answer = embedding\n"," answer.resize_(1, 768)\n"," else:\n"," embedding.resize_(1, 768)\n"," answer = torch.cat((answer, embedding),0)\n"," return answer"]},{"cell_type":"code","execution_count":null,"id":"a6665614","metadata":{"id":"a6665614"},"outputs":[],"source":["def similarity_l_l(l1, l2):\n"," '''\n"," Function to find the most similar sentence in the document for each sentence in the summary \n"," input: l1 - Summary sentences\n"," l2 - Document sentences\n"," returns a list of document sentence indexes for each sentence in the summary \n"," '''\n"," l = l1+l2\n"," sents_encodings = get_sen_encoding(l)\n"," similarities=cosine_similarity(sents_encodings)\n"," \n"," result = []\n"," for i in range(len(l1)):\n"," vals = similarities[i]\n"," vals = vals[len(l1):]\n"," idx = np.argmax(vals)\n"," result.append(idx)\n"," return result"]},{"cell_type":"code","execution_count":null,"id":"f574fa42","metadata":{"id":"f574fa42"},"outputs":[],"source":["def 
get_chunks_data_from_docV2(doc, summ):\n"," '''\n"," Function to generate chunks along with their summaries \n"," input: doc - legal Document\n"," summ - Gold standard summary\n"," returns a list of chunks and their summaries \n"," '''\n"," chunk_summ_word_threshold = 150\n"," sentence_mapping = {}\n"," doc_sents = split_to_sentences(doc)\n"," summ_sents = split_to_sentences(summ)\n"," \n"," result = (similarity_l_l(summ_sents,doc_sents))\n"," \n"," for i in range(len(summ_sents)):\n"," sentence_mapping[doc_sents[result[i]]] = summ_sents[i]\n"," \n"," final_chunks = []\n"," final_summ = []\n"," for chunk in nest_sentencesV2(doc, 1024):\n"," summ = \"\"\n"," for chunk_sent in chunk:\n"," if chunk_sent in sentence_mapping:\n"," summ = summ + sentence_mapping[chunk_sent]\n"," if len(tokenizer.tokenize(summ)) >= chunk_summ_word_threshold:\n"," final_chunks.append(\" \".join(chunk))\n"," final_summ.append(summ)\n"," return final_chunks, final_summ\n"]},{"cell_type":"code","execution_count":null,"id":"680336f0","metadata":{"id":"680336f0"},"outputs":[],"source":["#loop to pass every document, generate the fine tuning data and saving in a excel file \n","import pandas as pd\n","training_chunks = []\n","training_summs = []\n","for i in tqdm(range(len(data_source))):\n"," cks, summs = get_chunks_data_from_docV2(data_source[i],data_summary[i])\n"," training_chunks = training_chunks + cks\n"," training_summs = training_summs + summs\n","# print(i, len(training_summs), end = \", \", sep = \" : \")\n"," if i%100 == 0: \n"," full = list(zip(training_chunks,training_summs))\n"," df = pd.DataFrame(full,columns=['data', 'summary'])\n"," df.to_excel(\"FD_\"+dataset+\"_CLS_BK.xlsx\")\n","# break\n","full = list(zip(training_chunks,training_summs))\n","df = pd.DataFrame(full,columns=['data', 'summary'])\n","df.to_excel(\"FD_\"+dataset+\"_CLS.xlsx\")"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"gen_fine_tuning_data_CLS.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/Generate_fine_tuning_data/gen_fine_tuning_data_MCS.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"f1087f67","metadata":{"id":"f1087f67"},"source":["### Script to generate fine tuning data using MCS similarity method\n","\n","Change the datasets variable according to the requirements"]},{"cell_type":"code","execution_count":null,"id":"a4e43880","metadata":{"id":"a4e43880"},"outputs":[],"source":["dataset = \"IN\" # Options: IN, UK "]},{"cell_type":"code","execution_count":null,"id":"d5098bbc","metadata":{"id":"d5098bbc"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import json\n","from tqdm import tqdm\n","import os\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","# os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""]},{"cell_type":"code","execution_count":null,"id":"e1ea3042","metadata":{"id":"e1ea3042"},"outputs":[],"source":["#Reading the documents and summaries \n","names, 
data_source, data_summary = get_summary_data(dataset, \"train\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))"]},{"cell_type":"code","execution_count":null,"id":"43bdbed6","metadata":{"id":"43bdbed6"},"outputs":[],"source":["# Loading Model and tokenizer\n","from sentence_transformers import SentenceTransformer\n","from sklearn.metrics.pairwise import cosine_similarity\n","\n","sbert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens').to(\"cuda\")\n"]},{"cell_type":"code","execution_count":null,"id":"a6665614","metadata":{"id":"a6665614"},"outputs":[],"source":["def similarity_l_l(l1, l2):\n"," '''\n"," Function to find the most similar sentence in the document for each sentence in the summary \n"," input: l1 - Summary sentences\n"," l2 - Document sentences\n"," returns a list of document sentence indexes for each sentence in the summary \n"," '''\n"," document_embeddings = sbert_model.encode(l1+l2)\n"," similarities=cosine_similarity(document_embeddings)\n"," \n"," result = []\n"," for i in range(len(l1)):\n"," vals = similarities[i]\n"," vals = vals[len(l1):]\n"," idx = np.argmax(vals)\n"," result.append(idx)\n"," return result"]},{"cell_type":"code","execution_count":null,"id":"f574fa42","metadata":{"id":"f574fa42"},"outputs":[],"source":["def get_chunks_data_from_docV2(doc, summ):\n"," '''\n"," Function to generate chunks along with their summaries \n"," input: doc - legal Document\n"," summ - Gold standard summary\n"," returns a list of chunks and their summaries \n"," '''\n"," chunk_summ_word_threshold = 150\n"," sentence_mapping = {}\n"," doc_sents = split_to_sentences(doc)\n"," summ_sents = split_to_sentences(summ)\n"," \n"," result = (similarity_l_l(summ_sents,doc_sents))\n"," \n"," for i in range(len(summ_sents)):\n"," sentence_mapping[doc_sents[result[i]]] = summ_sents[i]\n"," \n"," final_chunks = []\n"," final_summ = []\n"," for chunk in nest_sentencesV2(doc, 1024):\n"," summ = \"\"\n"," for chunk_sent in chunk:\n"," if chunk_sent in sentence_mapping:\n"," summ = summ + sentence_mapping[chunk_sent]\n"," if len(summ.split(\" \")) >= chunk_summ_word_threshold:\n"," final_chunks.append(\" \".join(chunk))\n"," final_summ.append(summ)\n"," return final_chunks, final_summ\n","\n"]},{"cell_type":"code","execution_count":null,"id":"680336f0","metadata":{"id":"680336f0"},"outputs":[],"source":["#loop to pass every document, generate the fine tuning data and saving in a excel file \n","import pandas as pd\n","training_chunks = []\n","training_summs = []\n","for i in tqdm(range(len(data_source))):\n"," cks, summs = get_chunks_data_from_docV2(data_source[i],data_summary[i])\n"," training_chunks = training_chunks + cks\n"," training_summs = training_summs + summs\n","# print(i, len(training_summs), end = \", \", sep = \" : \")\n"," if i%100 == 0: \n"," full = list(zip(training_chunks,training_summs))\n"," df = pd.DataFrame(full,columns=['data', 'summary'])\n"," df.to_excel(\"FD_\" + dataset + \"_CSM_BK_512.xlsx\")\n","# break\n","full = list(zip(training_chunks,training_summs))\n","df = pd.DataFrame(full,columns=['data', 'summary'])\n","df.to_excel(\"FD_\" + dataset + \"_CSM_512.xlsx\")"]},{"cell_type":"code","execution_count":null,"id":"7c9b32fb","metadata":{"id":"7c9b32fb"},"outputs":[],"source":[""]}],"metadata":{"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.11"},"colab":{"name":"gen_fine_tuning_data_MCS.ipynb","provenance":[],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /abstractive/Legal-LED/Legal-LED_generate_summary.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"6cf3e374","metadata":{"id":"6cf3e374"},"source":["### Script to generate summaries using Legal-LED model\n","\n"]},{"cell_type":"code","execution_count":null,"id":"63ad1e2b","metadata":{},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext\n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"e2895caf","metadata":{},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","import os\n","import nltk"]},{"cell_type":"code","execution_count":null,"id":"7d7a6d73","metadata":{},"outputs":[],"source":["if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"af7086d5","metadata":{},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","len_dic = dict_names = get_req_len_dict(dataset, \"test\")"]},{"cell_type":"code","execution_count":null,"id":"07a2744e","metadata":{},"outputs":[],"source":["import os\n","import re\n","import numpy as np\n","import pandas as pd\n","import json\n","import random\n","import nltk\n","nltk.download('punkt')\n","\n","from IPython.display import display, HTML\n","import torch\n","import datasets\n","from datasets import load_dataset, load_metric, Dataset\n","from transformers import AutoTokenizer\n","from transformers import AutoModelForSeq2SeqLM\n","from transformers import LEDTokenizer, LEDForConditionalGeneration\n","from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments"]},{"cell_type":"code","execution_count":null,"id":"3f97b9a4","metadata":{},"outputs":[],"source":["model_name = \"nsi319/legal-led-base-16384\"\n","tokenizer = AutoTokenizer.from_pretrained(model_name)"]},{"cell_type":"code","execution_count":null,"id":"7721acec","metadata":{},"outputs":[],"source":["encoder_max_length = 1024*16\n","decoder_max_length = 1024"]},{"cell_type":"code","execution_count":null,"id":"e2667bb4","metadata":{},"outputs":[],"source":["led = AutoModelForSeq2SeqLM.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)\n","# set generate hyperparameters\n","led.config.num_beams = 2\n","led.config.max_length = decoder_max_length\n","led.config.min_length = 256\n","led.config.early_stopping = True\n","led.config.no_repeat_ngram_size = 4\n"]},{"cell_type":"code","execution_count":null,"id":"8ebe2023","metadata":{},"outputs":[],"source":["led = AutoModelForSeq2SeqLM.from_pretrained(\"./final_model/IN_model\", use_cache=False)\n","led = led.to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"cd35ada2","metadata":{},"outputs":[],"source":["import torch\n","led.eval()\n","\n","def generate_summary_gpu(input_text, l):\n","\tinputs = 
tokenizer(\n","\t\tinput_text,\n","\t\tpadding=\"max_length\",\n","\t\ttruncation=True,\n","\t\tmax_length=encoder_max_length,\n","\t\treturn_tensors=\"pt\",\n","\t)\n","\tinput_ids = inputs.input_ids.to(led.device)\n","\tattention_mask = inputs.attention_mask.to(led.device)\n","\tglobal_attention_mask = torch.zeros_like(attention_mask)\n","\t# put global attention on token\n","\tglobal_attention_mask[:, 0] = 1\n","\tl = max(l,256)\n","\tl = min(l,1024)\n","\toutputs = led.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length = l, num_beams = 2,repetition_penalty = 2.5,early_stopping = True)\t\n","\tpreds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_space=True) for gen_id in outputs]\n","\treturn \"\".join(preds)"]},{"cell_type":"code","execution_count":null,"id":"59e1530e","metadata":{},"outputs":[],"source":["for i in range(len(data_source)):\n"," name = names[i]\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len))\n"," \n"," abs_summ = generate_summary_gpu(doc,req_len)\n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n"," print(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()"]}],"metadata":{"accelerator":"GPU","colab":{"collapsed_sections":[],"machine_shape":"hm","name":"pegasus_finetune.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/Legal-LED/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | Legal-LED_generate_summary.ipynb - Script to generate summaries using Legal-LED Model 4 | 5 | Legal-LED_finetune.ipynb - Script to fine-tune Legal-LED Model 6 | 7 | # Requirements 8 | 9 | transformers 4.12.3 10 | 11 | pytorch 1.10 12 | 13 | # Usage 14 | 15 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 16 | 2. Follow the instructions given in the notebooks 17 | -------------------------------------------------------------------------------- /abstractive/Legal-Pegasus/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | pegasus-test.ipynb - Script to generate summaries using Pegasus Model 4 | 5 | pegasus-finetune.ipynb - Script to fine-tune Pegasus Model 6 | 7 | # Requirements 8 | 9 | transformers 4.12.3 10 | 11 | pytorch 1.10 12 | 13 | # Usage 14 | 15 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 16 | 2. 
Follow the instructions given in the notebooks 17 | -------------------------------------------------------------------------------- /abstractive/Legal-Pegasus/pegasus-test.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"8a842a0a","metadata":{"id":"8a842a0a"},"source":["### Script to generate summaries using chunking based Pegasus approach"]},{"cell_type":"code","execution_count":null,"id":"cb972436","metadata":{"id":"cb972436"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext\n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"1a83915a","metadata":{"id":"1a83915a"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","import os\n","import nltk"]},{"cell_type":"code","execution_count":null,"id":"049e7a6f","metadata":{"id":"049e7a6f"},"outputs":[],"source":["if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"0591966a","metadata":{"id":"0591966a"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","len_dic = dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"14b68b70","metadata":{"id":"14b68b70"},"outputs":[],"source":["device = \"cuda:1\""]},{"cell_type":"code","execution_count":null,"id":"0596cefd","metadata":{"id":"0596cefd"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n","from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments\n","\n","tokenizer = AutoTokenizer.from_pretrained(\"nsi319/legal-pegasus\") \n","model = AutoModelForSeq2SeqLM.from_pretrained(\"nsi319/legal-pegasus\").to(device)\n"]},{"cell_type":"code","execution_count":null,"id":"49734483","metadata":{"id":"49734483"},"outputs":[],"source":["def summerize(text, max_len, min_len):\n"," '''\n"," Function to generate summary using Pegasus\n"," input: nested_sentences - chunks\n"," max_l - Maximum length\n"," min_l - Minimum length\n"," output: document summary\n"," '''\n"," try:\n"," input_tokenized = tokenizer.encode(text, return_tensors='pt',max_length=512,truncation=True).to(device)\n"," summary_ids = model.generate(input_tokenized,\n"," num_beams=9,\n"," length_penalty=0.1,\n"," min_length=min_len,\n"," max_length=max_len,\n"," )\n"," summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]\n"," return summary\n"," except:\n"," return \"\""]},{"cell_type":"code","execution_count":null,"id":"e7f05633","metadata":{"id":"e7f05633"},"outputs":[],"source":["def summerize_doc(nested_sentences, p):\n"," '''\n"," Function to generate summary using chunking based Pegasus\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," result = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n"," max_len = l\n"," min_len = l-5\n"," result.append(summerize(nested, max_len, min_len))\n"," return 
result"]},{"cell_type":"code","execution_count":null,"id":"fd0385a1","metadata":{"id":"fd0385a1"},"outputs":[],"source":["done_files = glob.glob(output_path + \"*.txt\")\n","done_files = [i[i.rfind(\"/\")+1:] for i in done_files]"]},{"cell_type":"code","execution_count":null,"id":"0da2186e","metadata":{"id":"0da2186e"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","for i in range(len(data_source)):\n"," done_files = glob.glob(output_path + \"*.txt\")\n"," done_files = [i[i.rfind(\"/\")+1:] for i in done_files]\n"," name = names[i]\n"," if name in done_files:continue\n"," doc = data_source[i]\n"," input_len = len(doc.split(\" \"))\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len), end = \", \")\n"," \n"," nested = nest_sentences(doc,512)\n"," p = float(req_len/input_len)\n"," print(p)\n"," abs_summ = summerize_doc(nested,p)\n"," abs_summ = \" \".join(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," \n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n","# break"]},{"cell_type":"code","execution_count":null,"id":"33df9665","metadata":{"id":"33df9665"},"outputs":[],"source":[""]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.11"},"colab":{"name":"pegasus-test.ipynb","provenance":[],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /abstractive/Pointer Generator/chunker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c4f4f89a", 6 | "metadata": {}, 7 | "source": [ 8 | "### Script to chunk text files\n", 9 | "\n", 10 | "Each chunk would have its own text file" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "d855afea", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import glob\n", 23 | "import nltk\n", 24 | "import os\n", 25 | "import sys\n", 26 | "sys.path.insert(0, '../')\n", 27 | "from utilities import *" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "4bc23a54", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "output_path = \"./chunked/\"\n", 38 | "if not os.path.exists(output_path):\n", 39 | " os.makedirs(output_path)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "a446e900", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "names, data_source, data_summary = get_summary_data(\"IN\", \"test\")\n", 50 | "print(len(names))\n", 51 | "print(len(data_source))\n", 52 | "print(len(data_summary))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "366ba48f", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "output = []\n", 63 | "for i in range(len(data_source)):\n", 64 | " name = names[i]\n", 65 | " doc = 
data_source[i]\n", 66 | " wc = doc.split(\" \")\n", 67 | " print(str(i) + \": \" + name)\n", 68 | " nested = nest_sentences(doc,395)\n", 69 | " \n", 70 | " for i in range(len(nested)):\n", 71 | " path = output_path + name[:-4] + \"_\" + str(i) + \".txt\"\n", 72 | " print(path)\n", 73 | " file = open(path,'w')\n", 74 | " file.write(nested[i])\n", 75 | " file.close()\n", 76 | " \n", 77 | "print(output)" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.7.11" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 5 102 | } 103 | -------------------------------------------------------------------------------- /abstractive/Pointer Generator/combiner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ee01a304", 6 | "metadata": {}, 7 | "source": [ 8 | "### Script to combine summaries generated from each chunk for each file\n", 9 | "\n", 10 | "Change the input and output path according to requirements" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "02fdfddd", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "input_path = \"./chunked/\"\n", 22 | "output_path = \"./result/\"\n", 23 | "if not os.path.exists(output_path):\n", 24 | " os.makedirs(output_path)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "7bafb697", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import pandas as pd\n", 35 | "import numpy as np\n", 36 | "import glob\n", 37 | "import sys\n", 38 | "sys.path.insert(0, '../')\n", 39 | "from utilities import *" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "ed178de7", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Reading every chunk and combining summaries to get the final summaries\n", 50 | "from collections import defaultdict\n", 51 | "\n", 52 | "path = input_path\n", 53 | "all_files = glob.glob(path + \"/*.txt\")\n", 54 | "\n", 55 | "chunk_sum = defaultdict(list)\n", 56 | "\n", 57 | "for filename in all_files:\n", 58 | " with open(filename, 'r') as f: \n", 59 | " p = filename.rfind(\"/\")\n", 60 | " name = (filename[p+1:])\n", 61 | " chunk_number = int(name[name.rfind(\"_\")+1:name.rfind(\".\")])\n", 62 | " name = name[:name.rfind(\"_\")] + \".txt\"\n", 63 | " chunk_sum[name].append((chunk_number, f.read()))\n", 64 | "# print(chunk_number)\n", 65 | "# break" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "e42a54d2", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "len(chunk_sum)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "610b7898", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "dict_names = get_req_len_dict(\"IN\", \"test\") " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "cbf4995d", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "\n", 96 | "# Writing the resultant summaries to text files\n", 97 | "for filename in 
chunk_sum.keys():\n", 98 | "# print(filename)\n", 99 | "# print(chunk_sum[filename])\n", 100 | " sorted_chunks = (sorted(chunk_sum[filename], key=lambda x: x[0]))\n", 101 | " summary = ''\n", 102 | " for i in sorted_chunks:\n", 103 | " summary = summary + i[1]\n", 104 | "# print(summary)\n", 105 | " generated_length = len(summary.split(\" \"))\n", 106 | " required_length = dict_names[filename]\n", 107 | " if generated_length > required_length:\n", 108 | " summary = summary.split(\" \")\n", 109 | " summary = summary[:required_length]\n", 110 | " summary = \" \".join(summary)\n", 111 | " path = output_path + filename\n", 112 | " file = open(path,'w')\n", 113 | " file.write(summary)\n", 114 | " file.close()\n", 115 | "# break" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3.9.13 64-bit (microsoft store)", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.9.13" 136 | }, 137 | "vscode": { 138 | "interpreter": { 139 | "hash": "8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5" 140 | } 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 5 145 | } 146 | -------------------------------------------------------------------------------- /abstractive/Pointer Generator/readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | pointer-generator - Repository - https://github.com/abisee/pointer-generator 4 | 5 | chunker.ipynb - Script to divide a text file into chunks. Here each chunk will be saved in a text file. 6 | 7 | combiner.ipynb - Script to combine the summaries generated for each chunk in a text file into the final summary 8 | 9 | # Usage 10 | 11 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 12 | 2. Use chunker to chunk the input text files into chunk text files. 13 | 3. Use the pointer generator method to summarize each chunk. 14 | 4. Use the combiner to combine the summaries for every chunk to generate the final summaries. 15 | -------------------------------------------------------------------------------- /abstractive/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains scripts related to *Abstractive Summarization* algorithms for legal case documents. 
4 | 5 | | Directory | Description | 6 | | ------------- | ------------- | 7 | | BART based approaches | Scripts to fine-tune BART and generate summaries using BART_RR, BART and BERT-BART method | 8 | | BertSum-Abs | Implementation of BERTSum-abs and helper scripts | 9 | | Generate fine-tuning data | Scripts to generate fine tuning data using CLS, MCS, SIF and MCS_RR methods | 10 | | Legal-LED | Scripts to fine-tune and generate summaries using Legal-LED | 11 | | Legal-Pegasus | Scripts to fine-tune and generate summaries using Pegasus | 12 | | Pointer Generator | Implementation of Pointer-Generator and helper scripts | 13 | 14 | 15 | # Availability of Trained models 16 | 17 | The following models trained on the IN-Abs & UK-Abs datasets, are available at https://zenodo.org/record/7234359#.Y2tShdJByC1 18 | 19 | - Legal-LED 20 | - Legal-Pegasus 21 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This folder contains dataset of the paper *Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation* accepted at AACL-IJCNLP 2022. The dataset repository can be downloaded from https://zenodo.org/record/7152317#.Yz6mJ9JByC0 . 4 | 5 | There are 3 datasets : 6 | - IN-Abs : Indian Supreme Court case documents & their *abstractive* summaries, obtained from http://www.liiofindia.org/in/cases/cen/INSC/ 7 | - IN-Ext : Indian Supreme Court case documents & their *extractive* summaries, written by two law experts (A1, A2). 8 | - UK-Abs : United Kingdom (U.K.) Supreme Court case documents & their *abstractive* summaries, obtained from https://www.supremecourt.uk/decided-cases/ 9 | 10 | # Details of each dataset 11 | 12 | ## IN-Abs 13 | We crawled 7,130 full case documents and their corresponding abstractive summaries from http://www.liiofindia.org/in/cases/cen/INSC/ . 14 | Among them, 7030 (document, summary) pairs are randomly sampled as the training dataset. The remaining 100 (document, summary) 15 | pairs are considered as the test set. The directory structure is as follows : 16 | 17 | 18 | . 19 | ├── train-data # folder contains documents and summaries for training 20 | │ ├── judgement 21 | │ ├── summary 22 | │ ├── stats-IN-train.txt # text file containing the word and sentence count statistics of the documents 23 | └── test-data # folder contains documents and summaries for test 24 | │ ├── judgement 25 | │ ├── summary 26 | │ ├── stats-IN-test.txt # text file containing the word and sentence counts of the judgements & the summaries 27 | 28 | ## IN-Ext 29 | 30 | This dataset contains 50 Indian Supreme Court case documents and their extractive summaries. For each document, there are two summaries written individually by two law experts A1 and A2. Each summary by each law expert is of two types : full & segment-wise. 31 | 32 | - "full" : a coherent piece of summary. 33 | 34 | - "segment-wise" : the following segments are considered -- 'analysis', 'argument', 'facts', 'judgement', 'statute'. A "full" summary is broken down into these segments. Each segment is a folder. 35 | 36 | More details can be found in the associated README file of the dataset repository. 37 | 38 | The directory structure is as follows : 39 | 40 | 41 | . 
42 | ├── judgement # folder contains documents 43 | ├── summary 44 | │ ├── full # folder contains full summaries 45 | │ │ ├── A1 46 | │ │ ├── A2 47 | │ ├── segment-wise # folder contains segment-wise summaries 48 | │ │ ├── A1 49 | │ │ ├── A2 50 | ├── IN-EXT-length.txt # text file containing the word and sentence counts of the judgements & the summaries 51 | 52 | ## UK-Abs 53 | We crawled 793 full case documents and their corresponding abstractive summaries from https://www.supremecourt.uk/decided-cases/ . Among them, 693 (document, summary) pairs are randomly sampled as the training dataset. The remaining 100 (document, summary) pairs are considered as the test set. 54 | 55 | Similar to IN-Ext, the summaries in the dataset are of two types -- full and segment-wise. 56 | 57 | The segments in the UK dataset are 'background' , 'judgement' and 'reasons'. The test-data folder contains these segment-wise and full summaries. 58 | 59 | The directory structure is as follows : 60 | 61 | 62 | . 63 | ├── train-data # folder contains documents and summaries for training 64 | │ ├── judgement 65 | │ ├── summary 66 | │ ├── stats-UK-train.txt # text file containing the word and sentence count statistics of the documents 67 | └── test-data # folder contains documents and summaries for test 68 | │ ├── judgement 69 | │ ├── summary 70 | │ │ ├── full # folder contains full summaries 71 | │ │ ├── segment-wise # folder contains segment-wise summaries 72 | │ ├── stats-UK-test.txt # text file containing the word and sentence counts of the judgements & the summaries 73 | 74 | # Citation 75 | If you are using the dataset, please refer to the following paper: 76 | ``` 77 | @inproceedings{bhattacharya2021, 78 | title={Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation}, 79 | author={Shukla, Abhay and Bhattacharya, Paheli and Poddar, Soham and Mukherjee, Rajdeep and Ghosh, Kripabandhu and Goyal, Pawan and Ghosh, Saptarshi}, 80 | booktitle={The 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing}, 81 | year={2022} 82 | } 83 | 84 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/README.md: -------------------------------------------------------------------------------- 1 | # CaseSummarizer 2 | 3 | Implementation of the paper [CaseSummarizer](http://www.aclweb.org/anthology/C16-2054), COLING 2016 4 | 5 | A sample input file, along with its pre-processing output and summary is present in the sample folders. 
6 | 7 | ### (Optional) Preprocess the documents 8 | Description : Tokenize, lemmatize, remove stopwords 9 | 10 | #### Run : 11 | ``` 12 | python preprocess.py path/to/input/folder/ path/to/output/folder 13 | ``` 14 | 15 | ### Generate Summary : 16 | 17 | #### Run : 18 | ``` 19 | python summary.py path/to/input/folder/ path/to/output/folder/ path/to/dictionary.txt/file/ 20 | python summary_length.py path/to/original/doc/folder/ path/to/output/folder/from/summary.py/ path/to/output/folder/ fraction/of/original/text/length/as/summary 21 | ``` 22 | 23 | ### Prerequisites 24 | * [Python 2.7+](https://www.python.org/download/releases/2.7/) 25 | * [Numpy](http://www.numpy.org/) 26 | * [NLTK](http://www.nltk.org/) 27 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 15 09:50:36 2019 4 | 5 | @author: Paheli 6 | """ 7 | 8 | #!/usr/bin/env python2 9 | # -*- coding: utf-8 -*- 10 | """ 11 | Created on Tue Sep 18 14:49:13 2018 12 | 13 | @author: user 14 | """ 15 | import sys 16 | import re 17 | import os 18 | 19 | path_r = sys.argv[1] 20 | path_w = sys.argv[2] 21 | #path_r = "C:\\Users\\Paheli\\Downloads\\temp" 22 | #path_w = "C:\\Users\\Paheli\\Downloads\\temp_proc" 23 | 24 | for f in os.listdir(path_r): 25 | 26 | fileName = os.path.join(path_r,f) 27 | file = open(fileName, 'r') 28 | 29 | documents = [] 30 | documents = (file.read()).strip().split("\n"); 31 | file.close() 32 | 33 | for i in range(0,len(documents)): 34 | line = documents[i] 35 | if line.startswith("1."): 36 | break 37 | doc = documents[i:] 38 | temp = "" 39 | for eachDocument in doc[:]: 40 | eachDocument = re.sub(r'(\d\d\d|\d\d|\d)\.\s', ' ', eachDocument)#removes the paragraph lables 1. or 2. etc. 41 | eachDocument = re.sub(r'(?<=[a-zA-Z])\.(?=\d)', '', eachDocument)#removes dot(.) 
i.e File No.1063 42 | eachDocument = re.sub(r'(?<=\d|[a-zA-Z])\.(?=\s[\da-z])', ' ', eachDocument)#to remove the ending dot of abbr 43 | eachDocument = re.sub(r'(?<=\d|[a-zA-Z])\.(?=\s?[\!\"\#\$\%\&\'\(\)\*\+\,\-\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~])', '', eachDocument)#to remove the ending dot of abbr 44 | eachDocument = re.sub(r'(?\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', eachDocument)#removes the other punctuations 45 | 46 | temp = temp +''+eachDocument 47 | documents = [] 48 | temp = temp.replace(" "," ") 49 | documents = temp.replace(" ","",1) 50 | 51 | file_w = open(os.path.join(path_w,f),"w") 52 | file_w.write(documents) 53 | file_w.close() -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample folder/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_processed/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_shortsumm/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_shortsumm/sample input.txt: -------------------------------------------------------------------------------- 1 | But under the Income tax Act the position is somewhat different It appears that a fresh partnership deed was drawn up in the year 1945 when Gilbert was brought in The appeal therefore fails and is dismissed with costs In 1924 Mathews went out and his share was taken over by Figgies and Notley The Tribunal in spite of this document took the view that under the Partnership Act a firm could be carried on even if there was a change in its constitution The High Court refused to look into this document as it had not been relied upon before the Tribunal and no reference bad been specifically made to it in the order of the Income tax Officer or the Assistant Commissioner It is true that under the law of partnership a firm has no legal existence apart from its partners and it is merely a compendious name to describe its partners but it is also equally true that under that law there is no dissolution of the firm by the mere incoming or outgoing of partners Reference was made by Mr Daphtary to the partnership deed drawn up in 1 It was argued that a different firm was then constituted The true question to decide is one of identity of the unit assessed under the Income tax Act 1918 which paid double tax in the year 1939 with the unit to whose business the private limited company succeeded in the year 1 We have no doubt that the Tribunal and the High Court were right in holding that in spite of the mere changes in the constitution of the firm the business of the firm as originally constituted continued as tea brokers right from its inception till the time it was succeeded by the limited company and that it was the same unit all through carrying on the same business at the same place and there was no cesser of that business or any change in the unit The assessee is a partnership concern It upheld the view taken by the Tribunal A firm can be charged as a distinct assessable entity as distinct from its partners who can also be assessed individually Sections 26 48 and 55 of the Act fully bear out 
this position In 1939 Hillman was brought in and the partnership consisted of these three partners The Income tax Officer disallowed the claim of the assessee on the ground that the partners of the firm in 1939 being different from the partners of the firm in 1947 no relief could be given to the applicant Section 3 which is the charging section is in these terms Where any Central Act enacts that income tax shall be charged for any year at any rate or rates tax at that rate or those rates shall be charged for that year in accordance with and subject to the provisions of this Act in respect of the total income of the previous year of every individual Hindu undivided family company and local authority and of every firm and other association of persons or the partners of the firm or the members of the association individually The partners of the firm are distinct assessable entities while the firm as such is a separate and distinct unit for purposes of assessment The result is that we see no substantial grounds for disturbing the opinion given by the High Court on the question submitted to it The law with respect to retiring partners as enacted in the Partnership Act is to a certain extent a -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_summary/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/summary_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Sep 18 15:41:27 2018 5 | 6 | @author: user 7 | """ 8 | import string 9 | import sys 10 | import math 11 | import os 12 | 13 | count = 0 14 | path_orig = sys.argv[1] 15 | path_sum = sys.argv[2] 16 | path_w = sys.argv[3] 17 | length = float(sys.argv[4]) 18 | 19 | for f in os.listdir(path_orig): 20 | url_orig = os.path.join(path_orig,f) 21 | url_sum = os.path.join(path_sum,f) 22 | 23 | count+=1 24 | print count 25 | 26 | fr = open(url_orig,"r") 27 | 28 | doc_length = 0 29 | for line in fr.readlines(): 30 | line = line.rstrip("\n") 31 | line = line.translate(None,string.punctuation) 32 | ls = line.split(" ") 33 | doc_length = doc_length+len(ls) 34 | 35 | summ_length = math.floor(length*float(doc_length)) 36 | print doc_length,summ_length 37 | 38 | 39 | output = "" 40 | fr = open(url_sum,"r") 41 | rem_length = summ_length 42 | for line in fr.readlines(): 43 | # line = line.rstrip("\n") 44 | line2 = line.translate(None,string.punctuation) 45 | ls = line2.split(" ") 46 | if len(ls) < rem_length: 47 | output+=line 48 | rem_length = rem_length-len(ls) 49 | #print rem_length 50 | if len(ls) > rem_length: 51 | last = ' '.join(ls[:int(rem_length)]) 52 | output+=last 53 | 54 | file_w = open(os.path.join(path_w,f),"w") 55 | file_w.write(output) 56 | file_w.close() -------------------------------------------------------------------------------- /extractive/Gist/README.md: -------------------------------------------------------------------------------- 1 | ### Gist 2 | 3 | Implementation of the paper [Extracting the Gist of Chinese Judgments of the Supreme Court](https://dl.acm.org/doi/10.1145/3322640.3326715), ICAIL 2019 4 | 5 | Like all extractive supervised summarization methods, the training data for the model should be sentences labelled 0 (not a summary sentence) or 1 (summary sentence). 
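A minimal sketch of what such a labelled training file is assumed to look like -- one sentence per line, followed by its 0/1 label after a `$$$` marker; this format is inferred from how the feature-generation scripts in `codes/` split each line on `$$$`, and the sentences below are made up:

```
The appeal is accordingly dismissed with costs .$$$1
Reference was made to the partnership deed drawn up in 1945 .$$$0
```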
6 | 7 | We may use the [extractive_labels.py](https://github.com/Law-AI/summarization/blob/aacl/extractive/abs_to_ext/extractive_labels.py) code to create this training data. 8 | 9 | ### Generate the handcrafted features for the training set 10 | 11 | `python generate_features.py --data_path /path/to/train/documents/ --features_path /outpath/of/train/features/ --important_words cue_phrases.txt --pos_tags postags.txt --w2v True` 12 | [if a pretrained word2vec model is not present, a model is trained based on the training set corpus] 13 | 14 | `python generate_features.py --data_path /path/to/train/documents/ --features_path /outpath/of/train/features/ --important_words cue_phrases.txt --pos_tags postags.txt --word2vec_path /path/to/trained/word2vec/model.bin` 15 | [if a pretrained word2vec model is present, mention its path] 16 | 17 | ### Train the LGBM classifier 18 | `python train.py --data_path /path/to/train/documents/ --features_path /outpath/of/train/features/ --model_path /path/to/save/learned/model/` 19 | 20 | *To view other options (show help):* 21 | `python train.py -h` 22 | 23 | 24 | ### Generate the handcrafted features for test set 25 | 26 | `python generate_features.py --data_path /path/to/test/documents/ --features_path /outpath/of/test/features/ --important_words cue_phrases.txt --pos_tags postags.txt --word2vec_path /path/to/trained/word2vec/model.bin` 27 | 28 | ### Infer summaries using the trained LGBM classifier 29 | 30 | `python infer.py --data_path /path/to/test/documents/ --features_path /outpath/of/test/features/ --summary_path /path/to/save/summaries/ --model_path /path/to/trained/model.pkl --length_file /path/to/summary/length/file` 31 | The prediction probability of each sentence in stored in /outpath/of/test/features/pred_scores/ [this directory is automatically created] 32 | 33 | - The trained model has been made publicly available at https://zenodo.org/record/7234359#.Y2fx09JBy-o 34 | 35 | 36 | ### format of length file 37 | 38 | ``` 39 | filename required-summary-length-in-words 40 | ``` 41 | ### External libraries required 42 | 43 | - lightgbm = 2.3.1 44 | - sklearn = 0.21.3 45 | - gensim = 3.4.0 46 | - tqdm 47 | -------------------------------------------------------------------------------- /extractive/Gist/codes/generate_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | from codes import word2vec, sent_pos, quant_feats, imp_words, word_emb, pos_tags, open_words 3 | 4 | 5 | def features(args): 6 | 7 | data_folder = args.data_path 8 | 9 | handcrafted_features = args.important_words 10 | postags = args.pos_tags 11 | 12 | 13 | if not os.path.exists(args.features_path): 14 | os.mkdir(args.features_path) 15 | 16 | 17 | if args.w2v: 18 | word2vec.train_word2vec(args.data_path,args.features_path) 19 | w2v_model = args.features_path+"word2vec_model.bin" 20 | else: 21 | w2v_model = args.word2vec_path 22 | 23 | 24 | 25 | sent_pos_folder = args.features_path+"sent-pos\\" 26 | if not os.path.exists(sent_pos_folder): 27 | os.mkdir(sent_pos_folder) 28 | 29 | print("\n\nsentence postition") 30 | sent_pos.run_code(data_folder, sent_pos_folder) 31 | 32 | print("\n\nQuantitative Features") 33 | quant_feats.run_code(data_folder, sent_pos_folder, args.features_path+"features_quant") 34 | 35 | print("\n\n Important Words Features") 36 | imp_words.run_code(handcrafted_features, data_folder, args.features_path+"features_impwords") 37 | 38 | print("\n\n Word Embedding Features") 39 | word_emb.run_code(w2v_model, 
data_folder, args.features_path+"features_wordemb") 40 | 41 | print("\n\n Part-of-Speech Features") 42 | pos_tags.run_code(postags, data_folder, args.features_path+"features_pos") 43 | 44 | print("\n\n Open Word Features") 45 | open_words.run_code(w2v_model, data_folder, args.features_path+"features_openword") 46 | -------------------------------------------------------------------------------- /extractive/Gist/codes/generate_summary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 31 09:54:03 2020 4 | 5 | @author: Paheli 6 | """ 7 | # -*- coding: utf-8 -*- 8 | """ 9 | Created on Tue Apr 7 20:19:23 2020 10 | 11 | @author: Paheli 12 | """ 13 | import joblib 14 | import pandas as pd 15 | import os 16 | from tqdm import tqdm 17 | import spacy 18 | import string 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | nlp = spacy.load('en_core_web_sm') 22 | 23 | 24 | def count_word(text): 25 | doc = nlp(text) 26 | tokens = [t.text for t in doc] 27 | tokens = [t for t in tokens if len(t.translate(t.maketrans('', '', string.punctuation + string.whitespace))) > 0] # + string.digits 28 | #print(tokens) 29 | 30 | return len(tokens) 31 | 32 | 33 | def summary(args): 34 | 35 | summary_length_file = args.length_file 36 | test_data_feats = args.features_path 37 | test_data_text = args.data_path 38 | 39 | 40 | 41 | summary_labelled_path = os.path.join(args.features_path+"pred_scores") 42 | if not os.path.exists(summary_labelled_path): 43 | os.mkdir(summary_labelled_path) 44 | 45 | if not os.path.exists(args.summary_path): 46 | os.mkdir(args.summary_path) 47 | 48 | 49 | features_a1a2 = test_data_feats+"features_quant\\" 50 | features_a3 = test_data_feats+"features_impwords\\" 51 | features_a4 = test_data_feats+"features_wordemb\\" 52 | features_a5 = test_data_feats+"features_pos\\" 53 | features_a6 = test_data_feats+"features_openword\\" 54 | 55 | 56 | #for f in tqdm(folds) : 57 | 58 | lgbm_pickle = joblib.load(args.model_path) 59 | 60 | fr_len = open(args.length_file,"r") 61 | 62 | for line in tqdm(fr_len.readlines()): 63 | line = line.rstrip("\n") 64 | ls = line.split("\t") 65 | f = ls[0] 66 | reqd_length = int(ls[1]) 67 | 68 | 69 | f_a1a2 = os.path.join(features_a1a2,f) 70 | f_a3 = os.path.join(features_a3,f) 71 | f_a4 = os.path.join(features_a4,f) 72 | f_a5 = os.path.join(features_a5,f) 73 | f_a6 = os.path.join(features_a6,f) 74 | 75 | 76 | 77 | df_a1a2 = pd.read_table(f_a1a2,delimiter=' ',header=None) 78 | df_a3 = pd.read_table(f_a3,delimiter=' ',header=None) 79 | 80 | df_a4 = pd.read_table(f_a4,delimiter=' ',header=None) 81 | df_a5 = pd.read_table(f_a5,delimiter=' ',header=None) 82 | 83 | df_a6 = pd.read_table(f_a6,delimiter=' ',header=None) 84 | 85 | 86 | 87 | df_features = pd.concat([df_a1a2,df_a3,df_a4,df_a5,df_a6],axis=1) 88 | 89 | sc = StandardScaler() 90 | df_features = sc.fit_transform(df_features) 91 | 92 | 93 | fw = open(os.path.join(summary_labelled_path,f),"w") 94 | ypred=lgbm_pickle.predict_proba(df_features) 95 | i=-1 96 | summary_scores = {} 97 | sentence_pos = {} 98 | 99 | fr = open(os.path.join(test_data_text,f),"r") 100 | 101 | for line in fr.readlines(): 102 | line = line.rstrip("\n") 103 | i+=1 104 | pred = ypred[i] 105 | sentence_pos[line] = i 106 | summary_scores[line] = pred[1] 107 | 108 | fw.write(line+"$$$"+str(pred[1])+"\n") 109 | #if (i==19): 110 | # break 111 | fw.close() 112 | fr.close() 113 | 114 | sort_sum = sorted(summary_scores.items(), key=lambda item: item[1], 
reverse = True) 115 | sort_len = sorted(sentence_pos.items(), key=lambda item: item[1], reverse = False) 116 | 117 | shortlist = [] 118 | 119 | 120 | fw = open(os.path.join(args.summary_path,f),"w") 121 | length_so_far = 0 122 | for tup in sort_sum: 123 | sent = tup[0] 124 | length = count_word(sent) 125 | if length_so_far+length>reqd_length: 126 | break 127 | else: 128 | length_so_far+=length 129 | shortlist.append(sent) 130 | 131 | for tup in sort_len: 132 | sent = tup[0] 133 | if sent in shortlist: 134 | fw.write(sent+"\n") 135 | fw.close() 136 | print(f,reqd_length,length_so_far) 137 | 138 | fr_len.close() 139 | 140 | 141 | -------------------------------------------------------------------------------- /extractive/Gist/codes/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/Gist/codes/imp_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 20:40:16 2020 4 | 5 | @author: Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Tue Apr 7 16:57:03 2020 11 | 12 | @author: Paheli 13 | """ 14 | import numpy as np 15 | from tqdm import tqdm 16 | import os 17 | 18 | def run_code(hcf_features,data_folder,features_folder): 19 | fr1 = open(hcf_features,"r") 20 | if not os.path.exists(features_folder): 21 | os.mkdir(features_folder) 22 | features = [] 23 | for line in fr1.readlines(): 24 | line = line.rstrip("\n") 25 | line = line.lower() 26 | features.append(line) 27 | 28 | for f in tqdm(os.listdir(data_folder)): 29 | file = os.path.join(data_folder,f) 30 | fr2 = open(file,"r") 31 | fw = open(os.path.join(features_folder,f),"w") 32 | 33 | for sentence in fr2.readlines(): 34 | sentence = sentence.rstrip("\n") 35 | if "$$$" in sentence: 36 | sls = sentence.split("$$$") 37 | sentence = sls[0] 38 | 39 | onehot = np.zeros(94) 40 | for i in range(0,94): 41 | f = features[i] 42 | if f.lower() in sentence.lower(): 43 | onehot[i] = 1 44 | fw.write(" ".join(map(str, onehot))+"\n") 45 | 46 | fw.close() 47 | 48 | -------------------------------------------------------------------------------- /extractive/Gist/codes/open_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 15 14:03:30 2020 4 | 5 | @author: Paheli 6 | """ 7 | from gensim.models import Word2Vec 8 | import numpy as np 9 | from nltk import word_tokenize 10 | from tqdm import tqdm 11 | import os 12 | 13 | def run_code(model_path, data_folder, features_folder): 14 | model = Word2Vec.load(model_path) 15 | if not os.path.exists(features_folder): 16 | os.mkdir(features_folder) 17 | #pathr = "F:\\Summarization\\ChineseGist\\NEW SETUP\\TEST\\test_processed\\processed\\" 18 | #pathw = "F:\\Summarization\\ChineseGist\\NEW SETUP\\features_a6\\" 19 | 20 | for f in tqdm(os.listdir(data_folder)): 21 | file = os.path.join(data_folder,f) 22 | fr = open(file,"r") 23 | fw = open(os.path.join(features_folder,f),"w") 24 | for sentence in fr.readlines(): 25 | sentence = sentence.rstrip("\n") 26 | if "$$$" in sentence: 27 | sls = sentence.split("$$$") 28 | sentence = sls[0] 29 | summation = np.zeros(100) 30 | length=0 31 | tokens = word_tokenize(sentence) 32 | openwords = tokens[:5] 33 | for word in openwords: 34 | if word in model.wv.vocab: 35 | vec = model.wv[word] 36 | summation = np.add(summation,vec) 37 | length+=1 
38 | 39 | if length > 0 : summation = summation/length 40 | summation = np.around(summation,decimals=4) 41 | fw.write(" ".join(map(str, summation))+"\n") 42 | 43 | fw.close() -------------------------------------------------------------------------------- /extractive/Gist/codes/pos_tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 15 12:16:56 2020 4 | 5 | @author: Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Tue Apr 7 23:11:24 2020 11 | 12 | @author: Paheli 13 | """ 14 | 15 | # -*- coding: utf-8 -*- 16 | """ 17 | Created on Tue Apr 7 16:44:29 2020 18 | 19 | @author: Paheli 20 | """ 21 | import numpy as np 22 | from tqdm import tqdm 23 | import nltk 24 | import os 25 | 26 | def run_code(postags, data_folder, features_folder): 27 | 28 | fr1 = open(postags,"r") 29 | if not os.path.exists(features_folder): 30 | os.mkdir(features_folder) 31 | features = [] 32 | for line in fr1.readlines(): 33 | line = line.rstrip("\n") 34 | features.append(line) 35 | 36 | 37 | 38 | for f in tqdm(os.listdir(data_folder)): 39 | file = os.path.join(data_folder,f) 40 | fr = open(file,"r") 41 | fw = open(os.path.join(features_folder,f),"w") 42 | for sentence in fr.readlines(): 43 | sentence = sentence.rstrip("\n") 44 | if "$$$" in sentence: 45 | sls = sentence.split("$$$") 46 | sentence = sls[0] 47 | tokens = nltk.word_tokenize(sentence) 48 | postags = nltk.pos_tag(tokens) 49 | ptags = np.zeros(35) 50 | count=0 51 | if len(postags)>=10: 52 | for pair in postags[:10]: 53 | onehot = np.zeros(35) 54 | ptag = pair[1] 55 | if ptag in features: 56 | index = features.index(ptag) 57 | onehot[index] = 1 58 | ptags = np.add(ptags,onehot) 59 | count+=1 60 | 61 | else: 62 | x = len(postags) 63 | for pair in postags[:x]: 64 | onehot = np.zeros(35) 65 | ptag = pair[1] 66 | if ptag in features: 67 | index = features.index(ptag) 68 | onehot[index] = 1 69 | ptags = np.add(ptags,onehot) 70 | count+=1 71 | ptags = ptags/count 72 | 73 | #print (ptags) 74 | fw.write(" ".join(map(str, ptags))+"\n") 75 | 76 | fw.close() 77 | 78 | 79 | -------------------------------------------------------------------------------- /extractive/Gist/codes/quant_feats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 19:22:57 2020 4 | 5 | @author: Paheli 6 | """ 7 | import string 8 | from nltk import word_tokenize 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | def feature_chars(sentence): 14 | chars = 0 15 | for n in sentence: 16 | chars+=1 17 | 18 | return chars 19 | 20 | def feature_words(sentence): 21 | words = len(word_tokenize(sentence)) 22 | return words 23 | 24 | def feature_uniqwords(sentence): 25 | w = word_tokenize(sentence) 26 | uniqwords=len(list(dict.fromkeys(w))) 27 | 28 | return uniqwords 29 | 30 | def feature_position(positions1,positions2,sentence): 31 | p1 = positions1.get(sentence) 32 | p2 = positions2.get(sentence) 33 | return p1,p2 34 | 35 | 36 | def run_code(data_folder,data_folder_sent_pos,features_folder): 37 | 38 | #data_folder_sent_pos= "F:\\Summarization\\ChineseGist\\NEW SETUP\\sent-pos\\" 39 | #data_folder = "F:\\Summarization\\ChineseGist\\NEW SETUP\\test_processed\\processed\\" 40 | #pathw = "F:\\Summarization\\ChineseGist\\NEW SETUP\\features_a1a2\\" 41 | if not os.path.exists(features_folder): 42 | os.mkdir(features_folder) 43 | 44 | 45 | positions1= {} 46 | positions2 = {} 47 | 
for file in os.listdir(data_folder_sent_pos): 48 | fr = open(os.path.join(data_folder_sent_pos,file),"r") 49 | for line in fr.readlines(): 50 | line = line.rstrip("\n") 51 | if "$$$" in line: 52 | ls = line.split("$$$") 53 | sent = ls[0].lstrip(" ").rstrip(" ") 54 | else: 55 | sent = line 56 | #sent = sent.translate(str.maketrans('', '', string.punctuation)) 57 | pos1 = ls[1].lstrip(" ").rstrip(" ") 58 | pos2 = ls[2].lstrip(" ").rstrip(" ") 59 | positions1[file+"///"+sent] = pos1 60 | positions2[file+"///"+sent] = pos2 61 | 62 | for file in tqdm(os.listdir(data_folder)): 63 | f = os.path.join(data_folder,file) 64 | fr1 = open(f,"r") 65 | fw = open(os.path.join(features_folder,file),"w") 66 | for line in fr1.readlines(): 67 | line = line.rstrip("\n") 68 | if "$$$" in line: 69 | ls = line.split("$$$") 70 | line = ls[0] 71 | 72 | line_orig = line.rstrip(" ") 73 | line = line.translate(str.maketrans('', '', string.punctuation)) 74 | s1 = int(feature_chars(line)) 75 | s2 = int(feature_words(line)) 76 | s3 = int(feature_uniqwords(line)) 77 | s4, s5 = feature_position(positions1,positions2,file+"///"+line_orig) 78 | 79 | sent = [s1,s2,s3,int(s4),float(s5)] 80 | 81 | sent = np.array(sent) 82 | 83 | sent = [s1,s2,s3,int(s4),float(s5)] 84 | sent = np.array(sent) 85 | 86 | fw.write(" ".join(map(str, sent))+"\n") 87 | fw.close() 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /extractive/Gist/codes/sent_pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 8 19:29:28 2020 4 | 5 | @author: Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Fri Apr 3 11:35:13 2020 11 | 12 | @author: Paheli 13 | """ 14 | import os 15 | from tqdm import tqdm 16 | 17 | def run_code(data_folder,sent_pos_folder): 18 | #data_folder = "F:\\Summarization\\ChineseGist\\NEW SETUP\\test_processed\\processed\\" 19 | #pathw = "F:\\Summarization\\ChineseGist\\NEW SETUP\\sent-pos\\" 20 | 21 | for file in tqdm(os.listdir(data_folder)): 22 | sent_pos = [] 23 | total_lines=0 24 | total_lines_list = [] 25 | fr = open(os.path.join(data_folder,file),"r") 26 | fw = open(os.path.join(sent_pos_folder,file),"w") 27 | 28 | for line in fr.readlines(): 29 | line = line.rstrip("\n") 30 | #ls = line.split("$$$") 31 | sent = line.lstrip(" ").rstrip(" ") 32 | total_lines+=1 33 | sent_pos.append(sent) 34 | total_lines_list.append(total_lines) 35 | #print(line) 36 | 37 | for i in range(0,len(sent_pos)): 38 | v = total_lines_list[i] 39 | k = sent_pos[i] 40 | rel = float(v)/float(total_lines) 41 | g = float("{:.5f}".format(rel)) 42 | fw.write(k+" $$$ "+ str(v)+ " $$$ " +str(g)+"\n") 43 | fw.close() 44 | fr.close() 45 | -------------------------------------------------------------------------------- /extractive/Gist/codes/train_classifier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from sklearn.preprocessing import StandardScaler 4 | import joblib,os 5 | from sklearn import model_selection 6 | from lightgbm import LGBMClassifier 7 | 8 | def train_lgbm(args): 9 | 10 | filelist = os.listdir(args.data_path) 11 | 12 | df_a1a2 = pd.DataFrame() 13 | df_a3 = pd.DataFrame() 14 | df_a4 = pd.DataFrame() 15 | df_a5 = pd.DataFrame() 16 | df_a6 = pd.DataFrame() 17 | label = [] 18 | 19 | for f in tqdm(filelist): 20 | f_a1a2 = os.path.join(args.features_path+"\\features_quant",f) 21 | f_a3 = 
os.path.join(args.features_path+"\\features_impwords",f) 22 | f_a4 = os.path.join(args.features_path+"\\features_wordemb",f) 23 | f_a5 = os.path.join(args.features_path+"\\features_pos",f) 24 | f_a6 = os.path.join(args.features_path+"\\features_openword",f) 25 | f_sent = os.path.join(args.features_path+"\\sent-pos",f) 26 | 27 | 28 | df_a1a2 = df_a1a2.append(pd.read_table(f_a1a2,delimiter=' ',header=None)) 29 | df_a3 = df_a3.append(pd.read_table(f_a3,delimiter=' ',header=None)) 30 | df_a4 = df_a4.append(pd.read_table(f_a4,delimiter=' ',header=None)) 31 | df_a5 = df_a5.append(pd.read_table(f_a5,delimiter=' ',header=None)) 32 | df_a6 = df_a6.append(pd.read_table(f_a6,delimiter=' ',header=None)) 33 | 34 | 35 | fr2 = open(f_sent,"r") 36 | for sentence in fr2.readlines(): 37 | sentence = sentence.rstrip("\n") 38 | ls = sentence.split("$$$") 39 | lab = ls[1].rstrip("\n").lstrip(" ").rstrip(" ") 40 | label.append(int(lab)) 41 | fr2.close() 42 | 43 | label = pd.Series(label) 44 | 45 | df_features = pd.concat([df_a1a2,df_a3,df_a4,df_a5,df_a6],axis=1) 46 | sc = StandardScaler() 47 | df_features = sc.fit_transform(df_features) 48 | 49 | 50 | print("TRAINING STARTS") 51 | 52 | 53 | 54 | lgbm = LGBMClassifier( 55 | objective='binary', 56 | boosting='gbdt', 57 | learning_rate = args.lr, #0.001, 58 | #max_depth = 8, 59 | num_leaves = args.numleaves,#1043, 60 | n_estimators = args.n_est,#400, 61 | bagging_fraction = 0.8, 62 | feature_fraction = 0.9) 63 | #reg_alpha = 0.2, 64 | #reg_lambda = 0.4)) 65 | 66 | mod = lgbm.fit(df_features, label) 67 | joblib.dump(mod,args.model_path+'summary_model.pkl') 68 | 69 | 70 | 71 | 72 | #train_data=lgb.Dataset(df_features,label=label) 73 | 74 | #param = {'num_leaves':1043, 'objective':'binary','learning_rate':0.001, 'boost_from_average':False} 75 | #param['metric'] = ['auc', 'binary_logloss'] 76 | 77 | #training our model using light gbm 78 | #num_round=50 79 | 80 | #lgbm=lgb.train(param,train_data,num_round) 81 | #joblib.dump(lgbm,args.model_path+'summary_model.pkl') 82 | -------------------------------------------------------------------------------- /extractive/Gist/codes/word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | from nltk import word_tokenize 3 | from gensim.models import Word2Vec 4 | import numpy as np 5 | import string 6 | from tqdm import tqdm 7 | 8 | def train_word2vec(train_data_path,save_model_path): 9 | sentences = [] 10 | print("reading files for word2vec") 11 | for file in tqdm(os.listdir(train_data_path)): 12 | fr = open(os.path.join(train_data_path,file),"r") 13 | for line in fr.readlines(): 14 | line = line.rstrip("\n") 15 | line = (line.split("$$$")[0]).lstrip(" ").rstrip(" ") 16 | line = line.translate(str.maketrans('', '', string.punctuation)) 17 | sentences.append(word_tokenize(line)) 18 | 19 | 20 | 21 | print ("starting word2vec training") 22 | model = Word2Vec(sentences, min_count=5) 23 | model.save(save_model_path+"word2vec_model.bin") 24 | # fw = open(save_model_path+"word-embs.txt","w") 25 | # for w in list(model.wv.vocab): 26 | # fw.write(w+" ") 27 | # vec = model.wv[w] 28 | # fw.write(" ".join(map(str, vec))+"\n") 29 | 30 | # fw.close() 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /extractive/Gist/codes/word_emb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 23:11:24 2020 4 | 5 | @author: 
Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Tue Apr 7 16:44:29 2020 11 | 12 | @author: Paheli 13 | """ 14 | from gensim.models import Word2Vec 15 | import numpy as np 16 | from nltk import word_tokenize 17 | from tqdm import tqdm 18 | import os 19 | 20 | def run_code(model_path,data_folder,features_folder): 21 | 22 | model = Word2Vec.load(model_path) 23 | if not os.path.exists(features_folder): 24 | os.mkdir(features_folder) 25 | 26 | for f in tqdm(os.listdir(data_folder)): 27 | file = os.path.join(data_folder,f) 28 | fr = open(file,"r") 29 | fw = open(os.path.join(features_folder,f),"w") 30 | for sentence in fr.readlines(): 31 | sentence = sentence.rstrip("\n") 32 | if "$$$" in sentence: 33 | sls = sentence.split("$$$") 34 | sentence = sls[0] 35 | summation = np.zeros(100) 36 | words = word_tokenize(sentence) 37 | length = 0 38 | for word in sentence: 39 | if word in model.wv.vocab: 40 | vec = model.wv[word] 41 | summation = np.add(summation,vec) 42 | length+=1 43 | 44 | if length > 0 : summation = summation/length 45 | summation = np.around(summation,decimals=4) 46 | fw.write(" ".join(map(str, summation))+"\n") 47 | 48 | fw.close() 49 | 50 | -------------------------------------------------------------------------------- /extractive/Gist/cue_phrases.txt: -------------------------------------------------------------------------------- 1 | Question for consideration is 2 | Questions for consideration are 3 | Points for consideration are 4 | Point for consideration is 5 | Question that arise for consideration is 6 | Questions that arise for consideration are 7 | Points that arise for consideration are 8 | Point that arise for consideration is 9 | Question before us is 10 | Questions before us are 11 | Points before us are 12 | Point before us is 13 | Question that arise before us is 14 | Questions that arise before us are 15 | Points that arise before us are 16 | Point that arise before us is 17 | find any reasons to interfere with 18 | find anything to interfere with 19 | we are not in agreement with 20 | we do not agree with 21 | Order under challenge in tha 22 | this order is under challenge 23 | Relevant facts in this case 24 | Facts of the case 25 | Facts of this case 26 | In the fact of this evidence 27 | On the basis of 28 | Court 29 | lower court 30 | appellate court 31 | authority 32 | We find that 33 | We did not find that 34 | We found that 35 | This court finds that 36 | This court found that 37 | This court did not find that 38 | It was found that 39 | We agree with 40 | in our view 41 | Petitioner 42 | Respondent 43 | Appellant 44 | plaintiff 45 | Act 46 | Constitution 47 | section 48 | sections 49 | article 50 | articles 51 | v 52 | vs 53 | Vs 54 | contending 55 | contend 56 | parties 57 | No provision in 58 | We hold 59 | valid 60 | legally valid 61 | legally not valid 62 | We agree with 63 | We are of the view that 64 | Statute 65 | In view 66 | According 67 | We are also of view 68 | petition 69 | Appeal 70 | Review 71 | Revision 72 | allowed 73 | dismissed 74 | upheld 75 | order of the court 76 | Dismiss 77 | dismissed 78 | dismissing sustained 79 | rejected 80 | Holding 81 | Tribunal 82 | suit 83 | trial court 84 | decree 85 | defendant 86 | provisions 87 | provision 88 | appeal 89 | appeals 90 | issue 91 | issue 92 | ratio 93 | fact 94 | facts 95 | 96 | -------------------------------------------------------------------------------- /extractive/Gist/generate_features.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jun 5 17:34:18 2021 4 | 5 | @author: Paheli 6 | """ 7 | import argparse 8 | from codes import generate_features 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 13 | parser.add_argument('--features_path', default = 'data/features/', type = str, help = 'Folder to store features of the textual data') 14 | parser.add_argument('--important_words', default = 'data/important_words.txt', type = str, help = 'Text file containing important/domain specific words') 15 | parser.add_argument('--pos_tags', default = 'data/postags.txt', type = str, help = 'Text file containing relevant pos tags for the task') 16 | parser.add_argument('--w2v', default = False, type = bool, help = 'Whether to train a word2vec model (on the train data)') 17 | parser.add_argument('--word2vec_path', default = 'data/word2vec_model.bin', type = str, help = 'Path where pretrained word2vec is present') 18 | 19 | 20 | args = parser.parse_args() 21 | 22 | print('Generating features for data in ...'+args.data_path, end = ' ') 23 | generate_features.features(args) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | 29 | -------------------------------------------------------------------------------- /extractive/Gist/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/Gist/infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jun 5 18:11:53 2021 4 | 5 | @author: Paheli 6 | """ 7 | # -*- coding: utf-8 -*- 8 | """ 9 | Created on Sat Jun 5 17:34:18 2021 10 | 11 | @author: Paheli 12 | """ 13 | import argparse 14 | from codes import generate_summary 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 19 | parser.add_argument('--features_path', default = 'data/features/', type = str, help = 'Folder to store features of the textual data') 20 | parser.add_argument('--summary_path', default = 'data/summaries/', type = str, help = 'Folder to store features of the textual data') 21 | 22 | parser.add_argument('--model_path', default = 'data/summary_model.pkl', type = str, help = 'Path where trained summarization model is present') 23 | parser.add_argument('--length_file', default = 'data/length.txt', type = str, help = 'Path to file containing summary length') 24 | 25 | args = parser.parse_args() 26 | 27 | 28 | generate_summary.summary(args) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /extractive/Gist/length.txt: -------------------------------------------------------------------------------- 1 | 78.txt 50 2 | 232.txt 80 3 | 266.txt 40 -------------------------------------------------------------------------------- /extractive/Gist/postags.txt: -------------------------------------------------------------------------------- 1 | CC 2 | CD 3 | DT 4 | EX 5 | FW 6 | IN 7 | JJ 8 | JJR 9 | JJS 10 | LS 11 | MD 12 | NN 13 | NNS 14 | NNP 15 | NNPS 16 | PDT 17 | POS 18 | PRP 19 | PRP$ 20 | RB 21 | RBR 22 | RBS 23 | RP 24 | TO 25 | UH 26 | 
VB 27 | VBD 28 | VBG 29 | VBN 30 | VBP 31 | VBZ 32 | WDT 33 | WP 34 | WP$ 35 | WRB -------------------------------------------------------------------------------- /extractive/Gist/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from codes import train_classifier 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | 7 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 8 | parser.add_argument('--features_path', default = 'features/', type = str, help = 'Folder to store features of the textual data') 9 | parser.add_argument('--model_path', default = 'features/', type = str, help = 'Path where trained summarization model is present') 10 | parser.add_argument('--lr', default = 0.001, type = float, help = 'learning rate of the model') 11 | parser.add_argument('--numleaves', default = 1043, type = int, help = 'no. of leaves') 12 | parser.add_argument('--n_est', default = 1043, type = int, help = 'no. of estimators') 13 | 14 | 15 | args = parser.parse_args() 16 | 17 | train_classifier.train_lgbm(args) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | 23 | -------------------------------------------------------------------------------- /extractive/GraphicalModel/README.md: -------------------------------------------------------------------------------- 1 | # GraphicalModel 2 | 3 | Implementation of the paper [Improving Legal Document Summarization Using Graphical Models](https://www.cse.iitm.ac.in/~ravi/papers/Saravanan_jurix_06.pdf), JURIX 2006 4 | 5 | ## Execution 6 | 7 | ```py 8 | $ python 9 | >>> import graphicalModel 10 | >>> summary = graphicalModel.get_summary('sample input.txt') 11 | ``` 12 | 13 | The following code assumes the reader has some basic understanding of the algorithm beforehand. The above instruction is the only step needed to extract tehe summary, the following is just an explanation of [graphicalModel.py](graphicalModel.py) 14 | 15 | We use ROUGE as a metric of summary evaluation. The top level file here is [graphicalModel.py](graphicalModel.py), which loops over all the testing documents, generates a summary via Graphical Model (CRF - Conditional Random Field), compares it with the manually annotated summary provided by WestLaw, computed and reports the ROUGE score for each document. 16 | 17 | The sample output is available at [`results_crf.txt`](results_crf.txt). The process fo generating summary for a legal document involves the following steps: 18 | 19 | * Training the CRF (needs to be trained just once, for all test documents) 20 | * Getting predicted categories for each sentence 21 | * Rank sentences on importance based on K-mixture model 22 | * Return summary based on max length of summary allowed 23 | 24 | ## Training the CRF 25 | 26 | Saravnan et. 
al. defined their own thematic roles/sections, different from the ones we followed, so the first step was to create an equivalent mapping: 27 | 28 | | Graphical Model category | Equivalent FIRE category | 29 | | --- | --- | 30 | | Identifying facts | Facts | 31 | | Establishing facts | Issue | 32 | | Arguing | Precedent | 33 | | Arguments | Arguments | 34 | | History | Ruling by lower court | 35 | | Arguments | Other general standards | 36 | | Ratio | Statute | 37 | | Final Decision | Ruling by present court | 38 | 39 | The features we use for training our CRF are: 40 | * The word, in lower case 41 | * The last 3 characters of the word (to detect suffixes like -ion, -ing) 42 | * The last 2 characters (suffixes like -ed) 43 | * Whether the word w is all capitals (abbreviation) 44 | * Whether w is a string or a number 45 | * Position of w within the sentence 46 | * Position of the paragraph in the document 47 | * POS tag 48 | * Presence of the aforementioned cues 49 | * Whether w is at the beginning or end of the sentence 50 | * The words surrounding w, i.e., the previous and the next word 51 | 52 | This is done via [crf_train.py](crf_train.py), which runs the training algorithm on all annotated documents in the `annotated` and `annotated_json` folders. The learned model is then saved as [`crf_alltrain.model`](crf_alltrain.model). 53 | 54 | ## Predicting categories / classes 55 | 56 | Once the CRF model is trained, we obtain the features and the classification for a test document via [`crf_test.py`](crf_test.py). 57 | In the current state, the test documents are HTML files, as in [`input_sample_test.html`](input_sample_test.html); the script handles HTML parsing and tokenization as well. The output of this script is the tokenized text, the features, and the category prediction for each sentence. 58 | 59 | ## Rank sentences 60 | 61 | After obtaining the predicted classes for a test document, we rank the sentences on the basis of importance. We use the K-mixture model to do this; the code is available as [`k_mix_model_test.py`](k_mix_model_test.py). It returns a mapping from each sentence to its score under the following heuristic, which builds on TF-IDF: 62 | ``` 63 | K-mixture model: the probability of word wi appearing k times is given as 64 | Pi(k) = (1-r) * δ(k,0) + (r/(s+1)) * (s/(s+1))^k 65 | 66 | δ(k,0) = 1 iff k = 0, else 0 67 | r = observed mean, s = observed IDF 68 | ``` 69 | 70 | ## Generating the summary 71 | 72 | We rank the sentences by their KMM scores (obtained above) and keep adding sentences in decreasing order of score until the template length of the summary is filled. Have a look at the `get_summary` function in [`graphicalModel.py`](graphicalModel.py) to see how this is implemented.
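As an illustration, here is a minimal sketch of this selection step (not the repository's exact `get_summary`; sentence ids are assumed to be 1-based, matching the dictionary returned by `k_mix_model_test.KMM`, and re-emitting the chosen sentences in document order is a design choice of this sketch):

```python
def build_summary(sentences, kmix_scores, max_words):
    """Greedily take sentences in decreasing K-mixture score until the
    word budget is exhausted, then restore document order."""
    # rank sentence ids (1-based) by descending K-mixture score
    ranked = sorted(kmix_scores, key=kmix_scores.get, reverse=True)
    chosen, used = set(), 0
    for sid in ranked:
        n_words = len(sentences[sid - 1].split())
        if used + n_words > max_words:
            break
        chosen.add(sid)
        used += n_words
    # emit the selected sentences in their original document order
    return " ".join(sentences[sid - 1] for sid in sorted(chosen))
```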
73 | -------------------------------------------------------------------------------- /extractive/GraphicalModel/crf_alltrain.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/extractive/GraphicalModel/crf_alltrain.model -------------------------------------------------------------------------------- /extractive/GraphicalModel/crf_test.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### For a given file, return the classification for each 3 | ### 4 | import crf_train 5 | import html2text 6 | import pycrfsuite 7 | import nltk 8 | import sys 9 | # import numpy as np 10 | import operator 11 | # from sklearn.metrics import classification_report 12 | 13 | 14 | def parse_html(file): 15 | ''' 16 | Return usable case content, as list 17 | ''' 18 | with open(file, 'r') as f: 19 | txt = f.read() 20 | 21 | txt=(txt.replace('\"\']|\"[^\"]*\"|\'[^\']*\')*>','')) 22 | t = html2text.html2text(txt) 23 | 24 | tokenized = nltk.tokenize.sent_tokenize(t) 25 | start = 0 26 | 27 | # The Judgment was delivered by / 'for educational use only' 28 | while start < len(tokenized) and 'the judgment was delivered by' not in tokenized[start].lower() : 29 | start = start + 1 30 | 31 | if start == len(tokenized): 32 | start = 0 33 | while 'for educational use only' not in tokenized[start].lower(): 34 | start = start + 1 35 | 36 | text, indices = [], [] 37 | for i in range(start+1,len(tokenized)): 38 | if 'thomson reuters south asia private limited' in tokenized[i].lower(): 39 | break 40 | text.append(tokenized[i].replace('\n',' ')) 41 | indices.append(i-start) 42 | 43 | return text, indices 44 | 45 | 46 | def test_crf(file, tagbot = None): 47 | ''' 48 | Use pre trained model (crf_alltrain) to classify file 49 | ''' 50 | text, indices = parse_html(file) 51 | length = len(indices) 52 | test_sentences = [] 53 | for line in text: 54 | tokens = nltk.tokenize.word_tokenize(line) 55 | labels = [(token, '-') for token in tokens] 56 | 57 | test_sentences.append(labels) 58 | 59 | test_postag = crf_train.add_postag(test_sentences) 60 | X_test = [crf_train.sentenceToFeatures(test_postag[i], indices[i]*1.0/length) for i in range(len(test_postag))] 61 | # test is not known at all 62 | # Y_test = [crf_train.sentenceToLabels(sentence) for sentence in test_postag] 63 | 64 | tagger = tagbot 65 | if tagger == None: 66 | tagger = pycrfsuite.Tagger() 67 | tagger.open('crf_alltrain.model') 68 | Y_pred = [tagger.tag(xseq) for xseq in X_test] 69 | 70 | # labels = {'-': 0, '?': 0} 71 | # counter = 1 72 | # for category in crf_train.cue_phrases: 73 | # labels[category] = counter 74 | # counter += 1 75 | 76 | # predictions = [labels[tag] for row in Y_pred for tag in row] 77 | ## Test is not known 78 | # # truths = [labels[tag] for row in Y_test for tag in row] 79 | # c = 0 80 | # for p, t in zip(predictions, truths): 81 | # if p+t!=0: 82 | # print(p, t) 83 | # c+=1 84 | # # predictions = np.array([labels[tag] for row in Y_pred for tag in row]) 85 | # # truths = np.array([labels[tag] for row in Y_test for tag in row]) 86 | # # # print(len(predictions)) 87 | # # # print(len(truths)) 88 | # # print(c, 'identified labels') 89 | # # print(predictions) 90 | # print( classification_report(truths, predictions,target_names=list(crf_train.cue_phrases.keys()) ) ) 91 | 92 | return text, X_test, Y_pred 93 | 94 | 95 | 96 | if __name__ == '__main__': 97 | ''' 98 | Nothing complex, 
call the MVP 99 | ''' 100 | if len(sys.argv) > 1: 101 | file = sys.argv[1] 102 | else: 103 | file = input('Provide file name to classify: ') 104 | test_crf(file) -------------------------------------------------------------------------------- /extractive/GraphicalModel/extract-categories.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | import re 4 | import json 5 | 6 | ANNOTATED_FOLDER = 'annotated' 7 | CATEGORIES_FOLDER = 'categories' 8 | 9 | category_abbr = { 10 | "Argument": "A", "Fact": "F", "Issue": "I", 11 | "Ruling by lower court": "LR", "Ruling by the present court": "R", 12 | "Statute": "SS", "Precedent": "SP", 13 | "Other general standards including customary equitable and other extra-legal considerations": "SO", 14 | } 15 | 16 | 17 | def extract_loop(): 18 | ''' 19 | run a loop to get all sentences 20 | categories - 21 | ''' 22 | stmts = {} 23 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'txt'] 24 | # print('Working on ', file) 25 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 26 | txt = f.readlines() 27 | txt = [i for i in txt if i!='\n'] 28 | start = txt.index('\n') 29 | 30 | text = '\n'.join(txt[start+1:len(txt)-1]) 31 | t2 = re.split("S[0-9]+ ", text) 32 | t2 = list(filter(None, t2)) # remove any empty strings 33 | for line in t2: 34 | category = line.split(' ')[0] 35 | if category == 'FE' or category == 'FI': 36 | category = 'F' 37 | if category not in stmts: 38 | stmts[category] = [] 39 | print(category,' first showed up') 40 | stmts[category].append(' '.join(line.split(' ')[1:])) # remove category 41 | 42 | # JSONs 43 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'json'] 44 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 45 | txt = f.readlines() 46 | tj = json.loads(''.join(txt)) 47 | 48 | for tagging in tj['annotation']: 49 | category = category_abbr[ tagging['label'][0] ] 50 | sentences = list(filter(None, tagging['points'][0]['text'].split('\n'))) 51 | if category not in stmts: 52 | stmts[category] = [] 53 | print(category,' first showed up') 54 | 55 | stmts[category].extend(sentences) 56 | 57 | 58 | print('Categories detected: ', ' '.join(stmts.keys())) 59 | 60 | print('Writing into files now') 61 | for category in stmts: 62 | categorized = stmts[category] 63 | categorized = [ i.rstrip() for i in categorized] 64 | categorytext = '\n'.join(categorized) 65 | with open(CATEGORIES_FOLDER + '/' + category + '.txt', 'w') as f: 66 | f.write(categorytext) 67 | print(category + ' done') 68 | 69 | if __name__ == '__main__': 70 | extract_loop() -------------------------------------------------------------------------------- /extractive/GraphicalModel/formulated_constants.py: -------------------------------------------------------------------------------- 1 | categorical_phrases = dict() 2 | categorical_pairs = dict() 3 | # https://www.isical.ac.in/~fire/2013/legal.html 4 | # Cannot unfortunately extract pairs automatically 5 | 6 | ## Fact 7 | # Months 8 | # 'the appellant' excluded because appellant already covered 9 | categorical_phrases['F'] = [ 10 | 'appellant','dated', 'No.', 'State', 'filed', 'Government','registration', 'basis', 11 | 'stated', 'notice', 'period', 'region', 12 | 'January', 'February', 'March','April','May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 13 | 14 | 'the plaintiff', 'the basis', 'basis of', 'Court of', 'stated that', 'filed a', 'was issued', 'the Constitution', 
15 | 'the Corporation', 'under Article','plaintiff and', 'show cause', 16 | 17 | 'High court of', 'that the said', 'referred to as', 'stated that the', 'In the year', 18 | ] 19 | 20 | categorical_pairs['F'] = { 21 | } 22 | 23 | ## Issue 24 | 25 | categorical_phrases['I'] = [ 26 | 'appeal by', 27 | 28 | ] 29 | 30 | categorical_pairs['I'] = { 31 | } 32 | 33 | ## Argument 34 | 35 | categorical_phrases['A'] = [ 36 | 'service', 'amount', 'value','section', 'goods','credit','license','supplied', 37 | 'recipient', 'country', 'quota', 'contended','contention', 38 | 39 | 'under Section', 'terms of', 'counsel for', 'before us', 'charged by', 'is charged', 'free of', 40 | 41 | 'charged by the', 'provided by the', 42 | 43 | ] 44 | 45 | categorical_pairs['A'] = { 46 | } 47 | 48 | ## Ruling by lower court 49 | 50 | categorical_phrases['LR'] = [ 51 | 'Tribunal', 'defendants', 'Judge', 'Bench','Division', 'trial', 52 | 53 | 'held that', 'trial court', 'the parent', 54 | 55 | 'that the defendants', 56 | ] 57 | 58 | categorical_pairs['LR'] = { 59 | } 60 | 61 | ## Statute 62 | 63 | categorical_phrases['SS'] = [ 64 | 'provisions', 'provision', 'act', 'section', 'chapter', '(', 'explanation', 'commission', 'agent', 'estates', 65 | 66 | 'of service', 'subject to', 'deemed to', 'or tenure', 67 | 68 | 'as may be', 'the estates of', 69 | 70 | 'the purpose of this', 'in a case where', 'as may be prescribed', 71 | ] 72 | 73 | categorical_pairs['SS'] = { 74 | } 75 | 76 | ## Precedent 77 | 78 | categorical_phrases['SP'] = [ 79 | 'v', 'Vs', 'vs', 'fact', 'SC', '&', 'conclusion', 'finding', 'SCC', 'S.C.R.', 'SCR', 80 | 'carrying', 'observed', 81 | 82 | 'this court', 'of law', 'question of', 'laid down', 'the question', 'was whether', 'decision was', 83 | 'evidence to', 'this Court', 'a finding', 84 | 85 | 'it was held', 'question of fact', 86 | ] 87 | 88 | categorical_pairs['SP'] = { 89 | } 90 | 91 | ## Other general standards 92 | years = [str(x) for x in list(range(1900, 2100))] 93 | categorical_phrases['SO'] = [ 94 | 'Civil', 'Appeal', 95 | 96 | 'Appeal No.', 97 | ] 98 | 99 | categorical_phrases['SO'].extend(years) 100 | categorical_pairs['SO'] = { 101 | } 102 | 103 | ## Ruling by the present court 104 | 105 | categorical_phrases['R'] = [ 106 | 'Dismiss', 'dismissed', 'dismissing', 'sustained', 'rejected', 'allowed','passed','case','considered','aside', 107 | 'preliminary','judgement','accordingly', 'amended', 'allow','owed', 'decree', 108 | 109 | 'the case', 'the Amending', 'set aside', 'order to',' appeal is', 'the Section', 'execution of', 'Appeal allowed', 110 | 111 | 'the part of', 'to the suit', 'of the Court', 112 | 113 | ] 114 | 115 | categorical_pairs['R'] = { 116 | } 117 | 118 | 119 | def include_toggled_list(l): 120 | ''' 121 | If list has summary, it should also have Summary and summary. 
122 | ''' 123 | l_ = list() 124 | for item in l: 125 | l_.append(item + '.') 126 | l += l_ 127 | l_ = list() 128 | 129 | for item in l: 130 | first_toggled = item[0].lower() if item[0].isupper() else item[0].upper() 131 | l_.append(first_toggled + item[1:]) 132 | l += l_ 133 | l = list(set(l)) 134 | 135 | 136 | def include_toggled_dict(d): 137 | ''' 138 | If dict has summary: x, it should also have Summary: x 139 | ''' 140 | d_ = dict() 141 | for key in d: 142 | first_toggled = key[0].lower() if key[0].isupper() else key[0].upper() 143 | d_[first_toggled + key[1:]] = d[key] 144 | # d = {**d, **d_} 145 | d.update(d_) 146 | 147 | 148 | def include_toggled_phrases(phrases): 149 | ''' 150 | Iterator for each category 151 | ''' 152 | for category in phrases: 153 | include_toggled_list(phrases[category]) 154 | 155 | 156 | def include_toggled_pairs(pairs): 157 | ''' 158 | Iterator for each categorical pair 159 | ''' 160 | total_pairs, c = sum([ len(pairs[category]) for category in pairs ]), 0 161 | for category in pairs: 162 | # 'If': ['so be'], 163 | # to be extended to 164 | # 'If': ['so be', 'So be'], 'if': ['so be', 'So be'] 165 | 166 | if category == 'arguing': 167 | # print('Passing', category) 168 | for key in pairs[category]: 169 | pairs[category][key] = list( set( pairs[category][key] ) ) 170 | pass 171 | # print('Not passing', category) 172 | for key in pairs[category]: 173 | # print ( pairs[category][key] ) 174 | include_toggled_list( pairs[category][key] ) 175 | c += 1 176 | # print(key, " => ", c, "/", total_pairs) 177 | 178 | c = 0 179 | for category in pairs: 180 | d2 = {} 181 | for key in pairs[category]: 182 | k2 = key[0].upper() if key[0].islower() else key[0].lower() 183 | d2[k2 + key[1:]] = pairs[category][key] 184 | c += 1 185 | # print(key, " => ", c, "/", total_pairs) 186 | pairs[category].update(d2) 187 | 188 | # make a set 189 | for category in pairs: 190 | for key in pairs[category]: 191 | pairs[category][key] = list( set( pairs[category][key] ) ) -------------------------------------------------------------------------------- /extractive/GraphicalModel/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/GraphicalModel/k_mix_model_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import math 4 | import html2text 5 | import operator 6 | import itertools 7 | import operator 8 | import nltk 9 | from nltk import tokenize 10 | from nltk.corpus import stopwords, wordnet as wn 11 | import crf_test 12 | 13 | # 14 | # python k-mix-model-test.py ../FullText_html/2010/2010_A_1.html 15 | # Pg 139 / 207 in thesis 16 | # 17 | 18 | stopwords = set(stopwords.words('english')) 19 | # intentionally skipping out quotes here 20 | punctuation = '!#$%&()*+,-./:;<=>?@[\\]^_`{|}~' 21 | for each in punctuation: 22 | stopwords.add(each) 23 | 24 | 25 | def accumulate(l): 26 | ''' 27 | Group By Key 28 | ''' 29 | it = itertools.groupby(l, operator.itemgetter(0)) 30 | for key, subiter in it: 31 | yield key, sum(item[1] for item in subiter) 32 | 33 | 34 | # Don't process tags, just strip them out. Use an indent of 4 spaces 35 | # and a page that's 80 characters wide. 
36 | 37 | # 1st Tokenize and form sentence 38 | def tokenizeSentence(fileContent): 39 | """ 40 | :param fileContent: fileContent is the file content which needs to be summarized 41 | :return: Returns a list of string(sentences) 42 | """ 43 | return tokenize.sent_tokenize(fileContent) 44 | 45 | 46 | # 2nd Case Folding 47 | def caseFolding(line): 48 | """ 49 | :param line is the input on which case folding needs to be done 50 | :return: Line with all characters in lower case 51 | """ 52 | return line.lower() 53 | 54 | 55 | # 3rd Tokenize and form tokens from sentence 56 | def tokenizeLine(sentence): 57 | """ 58 | :param sentence: Sentence is the english sentence of the file 59 | :return: List of tokens 60 | """ 61 | return tokenize.word_tokenize(sentence) 62 | 63 | 64 | # 4th Stop Word Removal 65 | def stopWordRemove(tokens): 66 | """ 67 | :param tokens: List of Tokens 68 | :return: List of tokens after removing stop words 69 | """ 70 | # list_tokens = [] 71 | # for token in tokens: 72 | # if token not in stopwords: 73 | # list_tokens.append(token) 74 | # return list_tokens 75 | list_tokens = [token for token in tokens if token not in stopwords] 76 | return list_tokens 77 | 78 | 79 | def KMM(eval_file): 80 | 81 | test_sentences, _ = crf_test.parse_html(eval_file) 82 | t2 = [caseFolding(sentence) for sentence in test_sentences] 83 | test_sentences = t2 84 | 85 | index, docId = {}, 1 86 | 87 | for sentence in test_sentences: 88 | sentence = sentence.strip() 89 | if len(sentence) == 0: 90 | continue 91 | 92 | tokens = nltk.tokenize.word_tokenize(sentence) 93 | final_tokens = stopWordRemove(tokens) 94 | for token in final_tokens: 95 | if token not in index: 96 | index[token] = [ (docId, 1) ] 97 | else: 98 | last_entry = index[token][-1] 99 | if last_entry[0] == docId: 100 | index[token][-1] = (docId, last_entry[1]+1) 101 | else: 102 | index[token].append( (docId, 1) ) 103 | 104 | docId = docId + 1 105 | # for key in index: 106 | # getList=index[key] 107 | # index[key]=list(accumulate(getList)) 108 | 109 | N, cf, tf, df = 360, 0, 0, 0 110 | num= math.log(N)/math.log(2) 111 | kmixresult={} 112 | sent_id=1 113 | 114 | for sentence in test_sentences: 115 | pk=0 116 | myline=tokenizeLine(sentence) 117 | final_tokens=stopWordRemove(myline) 118 | 119 | for token in final_tokens: 120 | if token not in index: 121 | continue 122 | tokenlist = index[token] 123 | cf = 0 124 | for x,y in tokenlist: 125 | if x == 1: 126 | tf = y 127 | df=len(tokenlist) 128 | 129 | cf = cf + y 130 | #print 'token='+token 131 | t= cf * 1.0 / N 132 | #print 't='+str(t) 133 | idf = num * 1.0 / df if df else 0 134 | #print 'idf='+str(idf) 135 | s = ( ( (cf-df) * 1.0) / df ) if df else 0 136 | #print 's='+str(s) 137 | r=1 138 | r = t/s if s else 0 139 | #print 'r='+str(r) 140 | pk = (r / (s+1) ) * (s / (s+1) ) + math.pow( (s / (s+1) ), tf) 141 | 142 | #print sentence 143 | #print pk 144 | kmixresult[sent_id] = pk 145 | sent_id = sent_id + 1 146 | # print(sentence, kmixresult[sent_id-1]) 147 | 148 | # for sentence in test_sentences: 149 | # print(sentence) 150 | 151 | # print(len(test_sentences)) 152 | # print(kmixresult) 153 | 154 | # kmix_sorted = sorted(kmixresult.items(), key=operator.itemgetter(1),reverse=True) 155 | # for key, value in kmix_sorted: 156 | # print(test_sentences[key-1]) 157 | 158 | return kmixresult 159 | 160 | 161 | if __name__ == '__main__': 162 | if len(sys.argv) > 1: 163 | file = sys.argv[1] 164 | else: 165 | file = input('Input file to compute scores for: ') 166 | 167 | KMM(file) 
-------------------------------------------------------------------------------- /extractive/GraphicalModel/k_mix_output.txt: -------------------------------------------------------------------------------- 1 | 28 38178520590013.51 2 | 82 7881645488754.992 3 | 101 33575773026.61672 4 | 22 1364470.2612857409 5 | 54 72495.7513888907 6 | 108 43626.25160667495 7 | 102 29825.09064748152 8 | 31 22892.134259259423 9 | 43 22892.134259259423 10 | 75 22892.134259259423 11 | 63 3788.987901833185 12 | 65 997.7629629629637 13 | 80 898.7824074074077 14 | 109 647.6126543209858 15 | 83 607.6975349433403 16 | 88 25.40546790173725 17 | 72 22.837191358024697 18 | 61 22.556018518518613 19 | 113 11.727969348658974 20 | 95 10.322311024562635 21 | 7 10.185326254232073 22 | 21 5.3516590024585176 23 | 67 5.3516590024585176 24 | 66 4.360383345537698 25 | 99 4.142083258445258 26 | 119 3.8282828282828305 27 | 107 3.784522803294937 28 | 116 3.636534839924669 29 | 73 3.349537037037038 30 | 48 3.3184104052444177 31 | 52 3.30304609876581 32 | 57 3.254129263116958 33 | 44 3.1158968880122733 34 | 118 3.087994034302757 35 | 37 3.034656814425525 36 | 15 2.787560079163129 37 | 86 2.559601742460641 38 | 87 2.559601742460641 39 | 85 2.183842592592593 40 | 55 1.886114937251301 41 | 13 1.7900508351488742 42 | 2 1.708509142053446 43 | 24 1.7067547055251968 44 | 5 1.6447378071140446 45 | 71 1.6053839218632604 46 | 19 1.5865484429065748 47 | 33 1.5583498677248677 48 | 115 1.5264147396127088 49 | 25 1.506456182958915 50 | 30 1.4200867052023125 51 | 117 1.3602677189724544 52 | 35 1.3319502074688796 53 | 41 1.2317673378076062 54 | 53 1.1426746306939062 55 | 100 1.1299515121630506 56 | 46 1.126531165311653 57 | 89 1.1068399452804376 58 | 34 1.0748245221027481 59 | 64 1.065339723526343 60 | 12 1.0566274319459612 61 | 90 1.0566274319459612 62 | 50 1.0496062992125987 63 | 40 1.025999243207236 64 | 94 1.0007581018518519 65 | 112 0.9680157946692991 66 | 26 0.9399547661981307 67 | 84 0.9399547661981307 68 | 8 0.8745990836197023 69 | 6 0.8711131862512143 70 | 32 0.8671118149451424 71 | 49 0.8671118149451424 72 | 74 0.8287825324668168 73 | 36 0.8269119181501733 74 | 47 0.8125956535047445 75 | 76 0.8116279069767441 76 | 51 0.796712744713269 77 | 58 0.7880797785666196 78 | 4 0.7046274333689918 79 | 91 0.7046274333689918 80 | 120 0.7046274333689918 81 | 16 0.636318407960199 82 | 1 0.6254374882950248 83 | 14 0.6235031663788141 84 | 23 0.48043987891327417 85 | 29 0.48043987891327417 86 | 9 0.2658674726287928 87 | 11 0.2658674726287928 88 | 17 0.2658674726287928 89 | 18 0.2658674726287928 90 | 20 0.2658674726287928 91 | 45 0.2658674726287928 92 | 56 0.2658674726287928 93 | 59 0.2658674726287928 94 | 68 0.2658674726287928 95 | 70 0.2658674726287928 96 | 77 0.2658674726287928 97 | 78 0.2658674726287928 98 | 81 0.2658674726287928 99 | 92 0.2658674726287928 100 | 97 0.2658674726287928 101 | 104 0.2658674726287928 102 | 106 0.2658674726287928 103 | 114 0.2658674726287928 104 | 93 0 105 | 111 -0.9504830917874396 106 | 3 -9.399999999999793 107 | 96 -9.399999999999793 108 | 38 -43.83194444444445 109 | 42 -88.66388888888876 110 | 103 -102.86783284742458 111 | 10 -326.65568186028565 112 | 60 -496.4928232689209 113 | 69 -23102081326.034996 114 | 79 -4.247436601304027e+18 115 | 98 -2.909398467025563e+33 116 | 105 -2.6681765651173983e+36 117 | 110 -1.289125498056627e+52 118 | 62 -2.1985642013044388e+70 119 | 27 -2.5956783858709018e+116 120 | 39 -1.6232370264991055e+162 121 | -------------------------------------------------------------------------------- 
/extractive/LetSum/README.md: -------------------------------------------------------------------------------- 1 | # LetSum 2 | 3 | Implementation of the paper [LetSum, a Text Summarization system in Law field](http://rali.iro.umontreal.ca/rali/?q=en/node/673), JURIX 2004 4 | 5 | ## Execution 6 | 7 | ``` 8 | $ python 9 | >>> import letsum 10 | >>> import letsum_test 11 | >>> summary = letsum_test.LetSum('sample input.txt') 12 | ``` 13 | 14 | We use ROUGE as the metric for summary evaluation. The top-level file here is [letsum.py](letsum.py), which loops over all the testing documents, generates a summary, compares it with the manually annotated summary provided by WestLaw, and computes and reports the ROUGE score for each document. Following is an explanation of how the top-level file operates. 15 | 16 | The sample output is available at [`results_letsum.txt`](results_letsum.txt). The summary is generated as follows: 17 | 18 | We first define the mapping as done previously in the following order: 19 | 20 | | FIRE category | Equivalent LetSum category | 21 | | --- | --- | 22 | | Facts | Introduction | 23 | | Issue | Introduction | 24 | | Arguments | Context | 25 | | Ruling by lower court | Context | 26 | | Statute | Analysis | 27 | | Precedent | Analysis | 28 | | Other general standards | Conclusion | 29 | | Ruling by present court | Conclusion | 30 | 31 | This is required since LetSum follows a template-filling approach, where each theme is allotted a percentage of the total permissible summary length: 32 | 33 | | Category | Percentage in summary | 34 | | --- | --- | 35 | | Introduction | 10 | 36 | | Context | 24 | 37 | | Analysis | 60 | 38 | | Conclusion | 6 | 39 | | Citation | 0 | 40 | 41 | The total length of the summary is restricted to *34%* of the total document length. Note that unlike the Graphical Model, we do not need to train any model beforehand. 42 | 43 | [`letsum_test.py`](letsum_test.py) expects an argument, in the form of [`input_sample_test.html`](input_sample_test.html), specifying the legal document to be summarized. 44 | 45 | ### Labelling each sentence 46 | 47 | For each sentence in the document, we compute a score indicating the number of cue phrases / cue pairs present in that sentence. The category with the highest score is assigned as the label for that sentence (Introduction/Context/...). In the absence of any cues, a default label of _Context_ is assigned to the sentence. 48 | 49 | ### Ranking importance of sentences 50 | 51 | Once the category for each sentence has been identified, we want to retrieve the most relevant phrases within that category. Importance is assigned via a heuristic function, consisting of the position of the paragraph in the document, the position of the sentence in the paragraph, TF-IDF, and other factors. However, this function was not available, so we rank sentences based on their TF-IDF scores within each category. Here, we avoid ranking stop words to keep the summary as informative as possible. 52 | 53 | ### Generating the summary 54 | 55 | After ranking the phrases/words by their TF-IDF scores within each category, we sort them in decreasing order and keep adding phrases to the output summary until the word threshold for that category is reached. 56 | 57 | Say, for instance, a document of length 1600 words will have a summary of at most 1600*34% = 544 words, of which Context will be 24% = 130 words. In decreasing order of TF-IDF scores, we keep adding words belonging to the category of "Context" as long as the output summary has fewer than 130 words.
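To make this arithmetic concrete, the snippet below is a minimal sketch written for this README only; the helper name `category_budgets` and the integer truncation are our own choices and not code from this repository (the percentages are copied from the tables above):

```python
# Percentages taken from the tables above; SUMMARY_PERCENT mirrors the 34% overall limit.
TEMPLATE_PERCENT = {'Introduction': 10, 'Context': 24, 'Analysis': 60, 'Conclusion': 6, 'Citation': 0}
SUMMARY_PERCENT = 34

def category_budgets(doc_word_count):
    """Hypothetical helper: word budget allotted to each LetSum category for one document."""
    total_budget = doc_word_count * SUMMARY_PERCENT // 100
    return {cat: total_budget * pct // 100 for cat, pct in TEMPLATE_PERCENT.items()}

# category_budgets(1600)
# -> {'Introduction': 54, 'Context': 130, 'Analysis': 326, 'Conclusion': 32, 'Citation': 0}
```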
58 | 59 | ### Grammatical modifications 60 | 61 | Since the phrases might not make sense on their own, certain grammatical corrections have to be made in order for the summary to make sense. However, no further work from the authors of LetSum focused on this aspect, so this phase was not applied in our work. 62 | 63 | -------------------------------------------------------------------------------- /extractive/LetSum/extract-categories.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | import re 4 | import json 5 | 6 | ANNOTATED_FOLDER = 'annotated' 7 | CATEGORIES_FOLDER = 'categories' 8 | 9 | category_abbr = { 10 | "Argument": "A", "Fact": "F", "Issue": "I", 11 | "Ruling by lower court": "LR", "Ruling by the present court": "R", 12 | "Statute": "SS", "Precedent": "SP", 13 | "Other general standards including customary equitable and other extra-legal considerations": "SO", 14 | } 15 | 16 | 17 | def extract_loop(): 18 | ''' 19 | run a loop over the annotated files to get all sentences, 20 | grouped by category 21 | ''' 22 | stmts = {} 23 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'txt']: 24 | # print('Working on ', file) 25 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 26 | txt = f.readlines() 27 | txt = [i for i in txt if i!='\n'] 28 | start = txt.index('\n') 29 | 30 | text = '\n'.join(txt[start+1:len(txt)-1]) 31 | t2 = re.split("S[0-9]+ ", text) 32 | t2 = list(filter(None, t2)) # remove any empty strings 33 | for line in t2: 34 | category = line.split(' ')[0] 35 | if category == 'FE' or category == 'FI': 36 | category = 'F' 37 | if category not in stmts: 38 | stmts[category] = [] 39 | print(category,' first showed up') 40 | stmts[category].append(' '.join(line.split(' ')[1:])) # remove category 41 | 42 | # JSONs 43 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'json']: 44 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 45 | txt = f.readlines() 46 | tj = json.loads(''.join(txt)) 47 | 48 | for tagging in tj['annotation']: 49 | category = category_abbr[ tagging['label'][0] ] 50 | sentences = list(filter(None, tagging['points'][0]['text'].split('\n'))) 51 | if category not in stmts: 52 | stmts[category] = [] 53 | print(category,' first showed up') 54 | 55 | stmts[category].extend(sentences) 56 | 57 | 58 | print('Categories detected: ', ' '.join(stmts.keys())) 59 | 60 | print('Writing into files now') 61 | for category in stmts: 62 | categorized = stmts[category] 63 | categorized = [ i.rstrip() for i in categorized] 64 | categorytext = '\n'.join(categorized) 65 | with open(CATEGORIES_FOLDER + '/' + category + '.txt', 'w') as f: 66 | f.write(categorytext) 67 | print(category + ' done') 68 | 69 | if __name__ == '__main__': 70 | extract_loop() -------------------------------------------------------------------------------- /extractive/LetSum/formulated_constants.py: -------------------------------------------------------------------------------- 1 | categorical_phrases = dict() 2 | categorical_pairs = dict() 3 | # https://www.isical.ac.in/~fire/2013/legal.html 4 | # Cannot unfortunately extract pairs automatically 5 | 6 | ## Fact 7 | # Months 8 | # 'the appellant' excluded because appellant already covered 9 | categorical_phrases['F'] = [ 10 | 'appellant','dated', 'No.', 'State', 'filed', 'Government','registration',
'basis', 11 | 'stated', 'notice', 'period', 'region', 12 | 'January', 'February', 'March','April','May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 13 | 14 | 'the plaintiff', 'the basis', 'basis of', 'Court of', 'stated that', 'filed a', 'was issued', 'the Constitution', 15 | 'the Corporation', 'under Article','plaintiff and', 'show cause', 16 | 17 | 'High court of', 'that the said', 'referred to as', 'stated that the', 'In the year', 18 | ] 19 | 20 | categorical_pairs['F'] = { 21 | } 22 | 23 | ## Issue 24 | 25 | categorical_phrases['I'] = [ 26 | 'appeal by', 27 | 28 | ] 29 | 30 | categorical_pairs['I'] = { 31 | } 32 | 33 | ## Argument 34 | 35 | categorical_phrases['A'] = [ 36 | 'service', 'amount', 'value','section', 'goods','credit','license','supplied', 37 | 'recipient', 'country', 'quota', 'contended','contention', 38 | 39 | 'under Section', 'terms of', 'counsel for', 'before us', 'charged by', 'is charged', 'free of', 40 | 41 | 'charged by the', 'provided by the', 42 | 43 | ] 44 | 45 | categorical_pairs['A'] = { 46 | } 47 | 48 | ## Ruling by lower court 49 | 50 | categorical_phrases['LR'] = [ 51 | 'Tribunal', 'defendants', 'Judge', 'Bench','Division', 'trial', 52 | 53 | 'held that', 'trial court', 'the parent', 54 | 55 | 'that the defendants', 56 | ] 57 | 58 | categorical_pairs['LR'] = { 59 | } 60 | 61 | ## Statute 62 | 63 | categorical_phrases['SS'] = [ 64 | 'provisions', 'provision', 'act', 'section', 'chapter', '(', 'explanation', 'commission', 'agent', 'estates', 65 | 66 | 'of service', 'subject to', 'deemed to', 'or tenure', 67 | 68 | 'as may be', 'the estates of', 69 | 70 | 'the purpose of this', 'in a case where', 'as may be prescribed', 71 | ] 72 | 73 | categorical_pairs['SS'] = { 74 | } 75 | 76 | ## Precedent 77 | 78 | categorical_phrases['SP'] = [ 79 | 'v', 'Vs', 'vs', 'fact', 'SC', '&', 'conclusion', 'finding', 'SCC', 'S.C.R.', 'SCR', 80 | 'carrying', 'observed', 81 | 82 | 'this court', 'of law', 'question of', 'laid down', 'the question', 'was whether', 'decision was', 83 | 'evidence to', 'this Court', 'a finding', 84 | 85 | 'it was held', 'question of fact', 86 | ] 87 | 88 | categorical_pairs['SP'] = { 89 | } 90 | 91 | ## Other general standards 92 | years = [str(x) for x in list(range(1900, 2100))] 93 | categorical_phrases['SO'] = [ 94 | 'Civil', 'Appeal', 95 | 96 | 'Appeal No.', 97 | ] 98 | 99 | categorical_phrases['SO'].extend(years) 100 | categorical_pairs['SO'] = { 101 | } 102 | 103 | ## Ruling by the present court 104 | 105 | categorical_phrases['R'] = [ 106 | 'Dismiss', 'dismissed', 'dismissing', 'sustained', 'rejected', 'allowed','passed','case','considered','aside', 107 | 'preliminary','judgement','accordingly', 'amended', 'allow','owed', 'decree', 108 | 109 | 'the case', 'the Amending', 'set aside', 'order to',' appeal is', 'the Section', 'execution of', 'Appeal allowed', 110 | 111 | 'the part of', 'to the suit', 'of the Court', 112 | 113 | ] 114 | 115 | categorical_pairs['R'] = { 116 | } 117 | 118 | 119 | def include_toggled_list(l): 120 | ''' 121 | If list has summary, it should also have Summary and summary. 
122 | ''' 123 | l_ = list() 124 | for item in l: 125 | l_.append(item + '.') 126 | l += l_ 127 | l_ = list() 128 | 129 | for item in l: 130 | first_toggled = item[0].lower() if item[0].isupper() else item[0].upper() 131 | l_.append(first_toggled + item[1:]) 132 | l += l_ 133 | l = list(set(l)) 134 | 135 | 136 | def include_toggled_dict(d): 137 | ''' 138 | If dict has summary: x, it should also have Summary: x 139 | ''' 140 | d_ = dict() 141 | for key in d: 142 | first_toggled = key[0].lower() if key[0].isupper() else key[0].upper() 143 | d_[first_toggled + key[1:]] = d[key] 144 | # d = {**d, **d_} 145 | d.update(d_) 146 | 147 | 148 | def include_toggled_phrases(phrases): 149 | ''' 150 | Iterator for each category 151 | ''' 152 | for category in phrases: 153 | include_toggled_list(phrases[category]) 154 | 155 | 156 | def include_toggled_pairs(pairs): 157 | ''' 158 | Iterator for each categorical pair 159 | ''' 160 | total_pairs, c = sum([ len(pairs[category]) for category in pairs ]), 0 161 | for category in pairs: 162 | # 'If': ['so be'], 163 | # to be extended to 164 | # 'If': ['so be', 'So be'], 'if': ['so be', 'So be'] 165 | 166 | if category == 'arguing': 167 | # print('Passing', category) 168 | for key in pairs[category]: 169 | pairs[category][key] = list( set( pairs[category][key] ) ) 170 | pass 171 | # print('Not passing', category) 172 | for key in pairs[category]: 173 | # print ( pairs[category][key] ) 174 | include_toggled_list( pairs[category][key] ) 175 | c += 1 176 | # print(key, " => ", c, "/", total_pairs) 177 | 178 | c = 0 179 | for category in pairs: 180 | d2 = {} 181 | for key in pairs[category]: 182 | k2 = key[0].upper() if key[0].islower() else key[0].lower() 183 | d2[k2 + key[1:]] = pairs[category][key] 184 | c += 1 185 | # print(key, " => ", c, "/", total_pairs) 186 | pairs[category].update(d2) 187 | 188 | # make a set 189 | for category in pairs: 190 | for key in pairs[category]: 191 | pairs[category][key] = list( set( pairs[category][key] ) ) -------------------------------------------------------------------------------- /extractive/LetSum/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/LetSum/letsum.py: -------------------------------------------------------------------------------- 1 | from rouge import Rouge 2 | import os 3 | import html2text 4 | import nltk 5 | from collections import OrderedDict 6 | import operator 7 | import html2text 8 | import pycrfsuite 9 | import sys 10 | import time 11 | 12 | import letsum_test 13 | 14 | FULL_TEXT = '../../FullText_html/' 15 | MANUAL_SUM = '../../CaseAnalysis/' 16 | MAX_LENGTH_SUMMARY = 100 # Define maxmimum words in summary 17 | # MAX_PERCENT_SUMMARY = 34 # 10 # 10 in paper, 34 as discussed 18 | SUMMARY_PERCENT = 34 19 | REACHED_FILE = '' 20 | 21 | ## Max time taken so far: 3.87 hours 22 | ## 23 | ## Usage: python letsum.py 24 | ## Additionally: python letsum.py from_year to_year reached_file 25 | ## 26 | ## Goes upto, but not including to_year, skips files alphabetically smaller than reached_file 27 | ## 28 | 29 | 30 | # for 2010_B_31 31 | sys.setrecursionlimit(8735 * 2080 + 10) 32 | # reset to 1000 33 | 34 | def summary_letsum_looper(from_year=2010, to_year=2019, reached_file=''): 35 | ''' 36 | Loop over all files from 2010-2018, generate summaries and compute rouge scores 37 | ''' 38 | rouge = Rouge() 39 | scores = OrderedDict() 40 | summaries = {} 41 | REACHED_FILE = 
reached_file 42 | 43 | i = sum([ len( os.listdir( FULL_TEXT + str(year) ) ) for year in range(2010, from_year)]) + 1 44 | i0 = i-1 45 | # length is total number of documents 46 | length = sum([ len( os.listdir( FULL_TEXT + str(year) ) ) for year in range(2010, 2019)]) 47 | 48 | for year in range(from_year, to_year): 49 | l = sum([len( os.listdir( FULL_TEXT + str(y) ) ) for y in range(from_year, to_year)]) 50 | print('\n\n <=== ', year, ' / ', l, '===>\n\n') 51 | 52 | for case in sorted(os.listdir(FULL_TEXT + str(year))): 53 | print(i, ' / ', length, '=>', case, end='\r') 54 | if REACHED_FILE != '' and case <= REACHED_FILE: 55 | i += 1 56 | continue 57 | i2 = i - i0 58 | 59 | manual_summary = letsum_test.get_manual_summary(MANUAL_SUM + str(year) + '/' + case) 60 | if manual_summary == -1: 61 | print('+- Score ', i2, ' / ', l, ' ', i, ' / ', length, ' ', case, ' => Skipped') 62 | i += 1 63 | continue 64 | 65 | start = time.time() 66 | system_generated_summary = letsum_test.LetSum(FULL_TEXT + str(year) + '/' + case) 67 | running_time = time.time() - start 68 | print(system_generated_summary) 69 | print('Summary generated in ', running_time) 70 | 71 | # available are rouge-1, rouge-2, rouge-l 72 | # print(len(system_generated_summary.split(' '))) 73 | # print('\nManual\n', len(manual_summary.split(' '))) 74 | scores[case] = rouge.get_scores(system_generated_summary, manual_summary)[0] 75 | summaries[case] = system_generated_summary 76 | print('+- Score ', i2, ' / ', l, ' ', i, ' / ', length, ' ', case, ' => ', scores[case]) 77 | i += 1 78 | 79 | sys.setrecursionlimit(1000) # reset 80 | # print('\n\n-----------------------------------------') 81 | # print('Average score for ', len(scores), ' documents') 82 | # print('Fscore: ', sum([scores[case]['f'] for case in scores])) / len(scores) 83 | # print('Precision: ', sum([scores[case]['p'] for case in scores])) / len(scores) 84 | # print('Recall: ', sum([scores[case]['r'] for case in scores])) / len(scores) 85 | 86 | # Not doing top 10, instead handpicked cases 87 | # print('\n Top 10 documents by recall:') 88 | # metric = {case: scores[case]['r'] for case in scores} 89 | # sorted_r = sorted(metric.items(), key=operator.itemgetter(1), reverse=True) 90 | # # printing top 10 rouge scores 91 | # for i in range(10): 92 | # case = sorted_r[i][0] 93 | # print('# ',i+1, ': ', case, ' => ', sorted_r[i][1]) 94 | # print(summaries[case], '\n') 95 | 96 | 97 | if __name__ == '__main__': 98 | from_year, to_year, reached_file = 2010, 2019, '' 99 | if len(sys.argv) > 2: 100 | from_year, to_year = int(sys.argv[1]), int(sys.argv[2]) 101 | if len(sys.argv) > 3: 102 | reached_file = sys.argv[3] 103 | summary_letsum_looper(from_year, to_year, reached_file) -------------------------------------------------------------------------------- /extractive/MMR/MMR.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | import nltk 3 | from nltk.corpus import stopwords 4 | from nltk.tokenize import word_tokenize 5 | import spacy,os 6 | import argparse 7 | import re 8 | from tqdm import tqdm 9 | from collections import OrderedDict 10 | import string 11 | import numpy as np 12 | from spacy.lang.en import English 13 | import time 14 | nl = English() 15 | import sys 16 | import pandas as pd 17 | 18 | repeat = 5 19 | data = [] 20 | doc = [] 21 | l3 = [] 22 | summary = [] 23 | hypothesis = "" 24 | word_count = [] 25 | pair_similarity = [] 26 | summary_string = [] 27 | 28 | def 
count_word(index): 29 | global doc 30 | Doc = nl(doc[index]) 31 | tokens = [t.text for t in Doc] 32 | tokens = [t for t in tokens if len(t.translate(t.maketrans('', '', string.punctuation + string.whitespace))) > 0] # + string.digits 33 | return len(tokens) 34 | 35 | def store_word_count(): 36 | global word_count,doc 37 | word_count = [] 38 | for i in range(0,len(doc)): 39 | word_count.append(count_word(i)) 40 | 41 | def maximum(index, toPrint=0): 42 | global summary, pair_similarity 43 | length = len(summary) 44 | if(length!=0): 45 | max=0 46 | for i in range(length): 47 | a=pair_similarity[index][summary[i]] 48 | if(a>max): 49 | max=a 50 | if toPrint: 51 | print(str(summary[i])+" -> "+str(a)) 52 | return max 53 | else: 54 | return 0 55 | 56 | def count_sum(summary): 57 | sum=0 58 | length = len(summary) 59 | for i in range(length): 60 | sum+=count_word(summary[i]) 61 | return sum 62 | 63 | def mmr_sorted(lambda_, doc, length): 64 | global word_count, pair_similarity, summary 65 | #print('Inside MMR') 66 | print(length) 67 | l3 = [] 68 | vectorizer = TfidfVectorizer(smooth_idf=False) 69 | X = vectorizer.fit_transform(doc) 70 | y = X.toarray() 71 | rows = y.shape[0] 72 | cols = y.shape[1] 73 | pair_similarity = [] 74 | for i in range(rows): 75 | max=-1 76 | pair_similarity.append([]) 77 | for j in range(rows): 78 | if(j!=i): 79 | a = np.sum(np.multiply(y[i],y[j])) 80 | pair_similarity[-1].append(a) 81 | if(a>max): 82 | max=a 83 | else: 84 | pair_similarity[-1].append(1) 85 | l3.append(max) 86 | store_word_count() 87 | l = len(doc) 88 | count = 0 89 | last = -1 90 | summary = [] 91 | summary_word_count = 0 92 | while(1): 93 | if (summary_word_count < length): 94 | max=-1 95 | for i in range(l): 96 | a = maximum(i) 97 | mmrscore = lambda_*l3[i] - (1-lambda_)*a 98 | if(mmrscore >= max): 99 | max = mmrscore 100 | ind = i 101 | summary.append(ind) 102 | summary_word_count += word_count[ind] 103 | else: 104 | #print('Bye') 105 | break 106 | 107 | def listToString(): 108 | global summary_string, word_count, hypothesis, summary, doc 109 | summary_string = [] 110 | leng = 0 111 | for i in summary: 112 | if doc[i] not in summary_string: 113 | summary_string.append(doc[i]) 114 | leng += word_count[i] 115 | hypothesis = "".join(summary_string) 116 | 117 | 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 120 | parser.add_argument('--summary_path', default = 'data/summaries/', type = str, help = 'Folder to store features of the textual data') 121 | parser.add_argument('--length_file', default = 'data/length.txt', type = str, help = 'Path to file containing summary length') 122 | 123 | 124 | args = parser.parse_args() 125 | 126 | print('Generating summary in ...'+args.summary_path) 127 | num_docs = len(os.listdir(args.data_path)) 128 | 129 | X1 = pd.read_csv(args.length_file, sep="\t", header=None) 130 | 131 | for i in tqdm(range(0,num_docs)): 132 | length1=X1[1][i] 133 | #length2=X2[1][i] 134 | doc = [] 135 | with open(os.path.join(args.data_path,X1[0][i]), 'r') as file: 136 | for x in file: 137 | if x != '\n': 138 | doc.append(x) 139 | lamda=0.6 140 | #for j in lamda: 141 | mmr_sorted(lamda,doc,length1) 142 | listToString() 143 | f= open(os.path.join(args.summary_path,X1[0][i]),"w+") 144 | n = f.write(hypothesis) 145 | f.close() 146 | hypothesis="" 147 | -------------------------------------------------------------------------------- /extractive/MMR/README.md: 
-------------------------------------------------------------------------------- 1 | # MMR 2 | 3 | Implementation of the paper [Automatic Summarization of Legal Decisions using Iterative Masking of Predictive Sentences](https://dl.acm.org/doi/10.1145/3322640.3326728), ICAIL 2019 4 | 5 | 6 | ### Basic command to run: 7 | `python MMR.py --data_path /path/to/documents/ --summary_path /path/to/save/summaries/ --length_file lengthfile.txt` 8 | 9 | 10 | ### Format of the length file 11 | 12 | ``` 13 | filename required-summary-length-in-words 14 | ``` 15 | 16 | ### External libraries required 17 | 18 | - spacy = 2.2.3 19 | - nltk = 3.5 20 | - sklearn = 0.21.3 21 | - tqdm 22 | -------------------------------------------------------------------------------- /extractive/MMR/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/RBM/README.md: -------------------------------------------------------------------------------- 1 | # TextSummarizer 2 | Implementation of the paper [Extractive Summarization using Deep Learning](https://arxiv.org/pdf/1708.04439.pdf) 3 | 4 | Description: Uses two layers of Restricted Boltzmann Machines as a Deep Belief Network to enhance and abstract various features such as named entities, proper nouns, numeric tokens and sentence position, scores each sentence on these features, and then selects the top-scoring sentences to produce an extractive summary. 5 | 6 | ## Prerequisites 7 | * [Python 2.7+](https://www.python.org/download/releases/2.7/) 8 | * [Numpy](http://www.numpy.org/) 9 | * [Scipy](https://www.scipy.org/) 10 | * [Theano](https://github.com/Theano/Theano) 11 | * [NLTK](http://www.nltk.org/) 12 | 13 | ## Getting Started 14 | 1. Clone this repository
15 | 2. Run `python Summarizer.py` to summarize the articles. The first argument will be the directory containing the input files and the second argument will be the directory where the output will be stored
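   For example, using illustrative directory names (the paths are not fixed by the code): `python Summarizer.py ./input_articles/ ./output_summaries/`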
16 | -------------------------------------------------------------------------------- /extractive/RBM/enhancedfeatureSum: -------------------------------------------------------------------------------- 1 | -4.14446496262 2 | -5.22418720821 3 | -2.08044325936 4 | 3.51808975667 5 | -9.1456816337 6 | 2.28970787758 7 | -1.31615384616 8 | -5.99801672717 9 | 5.89963260743 10 | -6.83383889168 11 | 1.37234532214 12 | 1.13257516088 13 | -8.99964700893 14 | 6.37152248479 15 | -4.7296241192 16 | -3.30522300491 17 | 4.0947679278 18 | -8.66964930836 19 | 0.865994550113 20 | 0.0395620901692 21 | -------------------------------------------------------------------------------- /extractive/RBM/entity2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | sample = "Narenda Modi is the pm of India" 4 | 5 | def extract_entity_names(t): 6 | entity_names = [] 7 | 8 | if hasattr(t, 'label') and t.label: 9 | if t.label() == 'NE': 10 | entity_names.append(' '.join([child[0] for child in t])) 11 | else: 12 | for child in t: 13 | entity_names.extend(extract_entity_names(child)) 14 | 15 | return entity_names 16 | def ner(sample): 17 | try: 18 | sentences = nltk.sent_tokenize(sample) 19 | except: 20 | raise Exception 21 | tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 22 | tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] 23 | chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True) 24 | 25 | 26 | 27 | entity_names = [] 28 | for tree in chunked_sentences: 29 | # Print results per sentence 30 | # print extract_entity_names(tree) 31 | 32 | entity_names.extend(extract_entity_names(tree)) 33 | print set(entity_names) 34 | return len(entity_names) -------------------------------------------------------------------------------- /extractive/RBM/extractiveDemo.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import os 3 | 4 | def readPaths(path): 5 | # Create array of array of images. 
6 | print(path) 7 | imagePaths = [] 8 | filePaths = [] 9 | # List all files in the directory and read points from text files one by one 10 | for filePath in sorted(os.listdir(path)): 11 | # print("10 ", filePath) 12 | fileExt = os.path.splitext(filePath)[1] 13 | if fileExt in [".html"]: 14 | # print (filePath) 15 | filePaths.append(filePath) 16 | 17 | # Add to array of images 18 | imagePaths.append(os.path.join(path, filePath)) 19 | 20 | return imagePaths, filePaths 21 | 22 | print('Enter directory for text files') 23 | dirname2 = input() 24 | for dirs in os.listdir(dirname2): 25 | if dirs in ['2013', '2014']: 26 | # print(type(dirs)) 27 | if os.path.isdir(os.path.join(dirname2,dirs)): 28 | summaryFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'CaseAnalysis')) 29 | # print("hello") 30 | textFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'FullText')) 31 | # print(summaryFiles) 32 | # print(textFiles) 33 | for count in range(0,min(len(summaryFiles),len(textFiles))): 34 | HtmlFile = open(textFiles[count], 'r', encoding='utf-8') 35 | source_code = HtmlFile.read() 36 | soup = BeautifulSoup(source_code, 'html.parser') 37 | allDiv = soup.find_all('div', class_='locatorSection') 38 | #f = open("Summary_7.txt","wb") 39 | text = str() 40 | for i in allDiv: 41 | #print(i.find_all('h3')) 42 | flag = 0 43 | for h3tag in i.find_all('h3'): 44 | if h3tag.get('id') == 'casedigest': 45 | flag = 1 46 | break 47 | if flag == 1: 48 | for p in i.find_all('p'): 49 | if p.find('strong') != None: 50 | if p.find('strong').text == 'Summary:': 51 | for tag in p.find_all('strong'): 52 | tag.replaceWith('') 53 | # print(p.text) 54 | text = p.text 55 | #f.write(bytes(text,'UTF-8')) 56 | 57 | #f.close() 58 | if any(os.path.basename(textFiles[count]) in os.path.basename(s) for s in textFiles): 59 | HtmlFile = open(textFiles[count], 'r', encoding='utf-8') 60 | source_code = HtmlFile.read() 61 | soup = BeautifulSoup(source_code, 'html.parser') 62 | allDiv = soup.find_all('div', class_='docContent') 63 | 64 | f = open("Extractive/demo_test/"+dirs+"_"+names[count]+".txt","w") 65 | print("64 ", dirs+"_"+names[count]) 66 | for i in allDiv: 67 | footNotes = i.find_all('p', class_='small center') 68 | for tag in footNotes: 69 | tag.replaceWith('') 70 | # print(i.text) 71 | # f.write(i.text) 72 | # f.write("\n\n@highlight\n") 73 | f.write(text) 74 | 75 | f.close() 76 | -------------------------------------------------------------------------------- /extractive/RBM/featureSum: -------------------------------------------------------------------------------- 1 | 4.51799738281 2 | 1.45031689174 3 | 1.65310050155 4 | 0.257102174852 5 | 3.07559906141 6 | 2.37254781826 7 | 0.679321453233 8 | 1.58868062067 9 | 0.0396091840985 10 | 1.5214609106 11 | 0.472780015949 12 | 0.590904641577 13 | 3.63993420881 14 | -0.525165960839 15 | 1.52273977015 16 | 1.21188091948 17 | 0.402717131627 18 | 2.26820518638 19 | 2.50662999217 20 | -0.00553588031303 21 | -------------------------------------------------------------------------------- /extractive/RBM/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/RBM/ner.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | 4 | # tokenize doc 5 | with open('Texts/Text_1.txt','r') as f: 6 | doc = f.read() 7 | 8 | sent_text = nltk.sent_tokenize(doc) 9 | 10 | op = "" 11 | 12 | for sent 
in sent_text: 13 | tokenized_text = nltk.word_tokenize(sent) 14 | num_present = [(x in summ_word_tokens) for x in tokenized_text] 15 | num_present = sum(num_present * 1) 16 | if num_present / len(tokenized_text) < 0.25: 17 | score = 0 18 | elif num_present / len(tokenized_text) > 0.75: 19 | score = 2 20 | else: 21 | score = 1 22 | op += sent + ' \t\t' + str(score) + '\n' 23 | 24 | print(op) 25 | 26 | 27 | tokenized_doc = nltk.word_tokenize(doc) 28 | # tag sentences and use nltk's Named Entity Chunker 29 | tagged_sentences = nltk.pos_tag(tokenized_doc) 30 | ne_chunked_sents = nltk.ne_chunk(tagged_sentences) 31 | 32 | # extract all named entities 33 | 34 | named_entities = [] 35 | count = 1 36 | entity_list="" 37 | 38 | 39 | for tagged_tree in ne_chunked_sents: 40 | if hasattr(tagged_tree, 'label'): 41 | entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) # 42 | entity_type = tagged_tree.label() # get NE category 43 | named_entities.append((entity_name, entity_type)) 44 | 45 | named_entities = list(set(named_entities)) 46 | # print(named_entities) 47 | for entities in named_entities: 48 | rep_string = "@entity"+str(count) 49 | doc = doc.replace(entities[0], rep_string) 50 | entity_list+="@entity"+str(count)+":"+entities[0]+"\n" 51 | count+=1 52 | 53 | sentences = nltk.sent_tokenize(doc) 54 | 55 | # doc+="\n"+entity_list 56 | print(doc) -------------------------------------------------------------------------------- /extractive/RBM/para_reader.py: -------------------------------------------------------------------------------- 1 | 2 | import readline 3 | 4 | 5 | class Paragraphs: 6 | 7 | def __init__(self, fileobj, separator='\n'): 8 | 9 | # Ensure that we get a line-reading sequence in the best way possible: 10 | #import xreadlines 11 | try: 12 | # Check if the file-like object has an xreadlines method 13 | self.seq = fileobj.readlines() 14 | except AttributeError: 15 | # No, so fall back to the xreadlines module's implementation 16 | self.seq = xreadlines.readlines(fileobj) 17 | 18 | self.line_num = 0 # current index into self.seq (line number) 19 | self.para_num = 0 # current index into self (paragraph number) 20 | 21 | # Ensure that separator string includes a line-end character at the end 22 | if separator[-1:] != '\n': separator += '\n' 23 | self.separator = separator 24 | 25 | 26 | def __getitem__(self, index): 27 | if index != self.para_num: 28 | raise TypeError, "Only sequential access supported" 29 | 30 | self.para_num += 1 31 | # Start where we left off and skip 0+ separator lines 32 | while 1: 33 | # Propagate IndexError, if any, since we're finished if it occurs 34 | line = self.seq[self.line_num] 35 | #print "line : ",line 36 | self.line_num += 1 37 | if line != self.separator: break 38 | # Accumulate 1+ nonempty lines into result 39 | result = [line] 40 | while 1: 41 | # Intercept IndexError, since we have one last paragraph to return 42 | try: 43 | # Let's check if there's at least one more line in self.seq 44 | line = self.seq[self.line_num] 45 | #print "line 2 : ",line 46 | except IndexError: 47 | # self.seq is finished, so we exit the loop 48 | break 49 | # Increment index into self.seq for next time 50 | self.line_num += 1 51 | result.append(line) 52 | if line == self.separator: break 53 | 54 | return ''.join(result) 55 | 56 | # Here's an example function, showing how to use class Paragraphs: 57 | def show_paragraphs(filename, numpars=20): 58 | paralist = [] 59 | pp = Paragraphs(open(filename)) 60 | for p in pp: 61 | #print "Par#%d : %s" % (pp.para_num, repr(p)) 62 
| paralist.append(p) 63 | if pp.para_num>numpars: break 64 | 65 | return paralist 66 | #print(show_paragraphs('article2')) 67 | 68 | -------------------------------------------------------------------------------- /extractive/RBM/parseForExtract.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import os 3 | 4 | def readPaths(path): 5 | # Create array of array of images. 6 | print(path) 7 | imagePaths = [] 8 | filePaths = [] 9 | # List all files in the directory and read points from text files one by one 10 | for filePath in sorted(os.listdir(path)): 11 | # print("10 ", filePath) 12 | fileExt = os.path.splitext(filePath)[1] 13 | if fileExt in [".html"]: 14 | # print (filePath) 15 | filePaths.append(filePath) 16 | 17 | # Add to array of images 18 | imagePaths.append(os.path.join(path, filePath)) 19 | 20 | return imagePaths, filePaths 21 | 22 | print('Enter directory for text files') 23 | dirname2 = input() 24 | for dirs in os.listdir(dirname2): 25 | # print(type(dirs)) 26 | if os.path.isdir(os.path.join(dirname2,dirs)): 27 | summaryFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'CaseAnalysis')) 28 | # print("hello") 29 | textFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'FullText')) 30 | # print(summaryFiles) 31 | # print(textFiles) 32 | 33 | for count in range(0,min(len(summaryFiles),len(textFiles))): 34 | HtmlFile = open(summaryFiles[count], 'r', encoding='utf-8') 35 | source_code = HtmlFile.read() 36 | soup = BeautifulSoup(source_code, 'html.parser') 37 | allDiv = soup.find_all('div', class_='locatorSection') 38 | #f = open("Summary_7.txt","wb") 39 | text = str() 40 | for i in allDiv: 41 | #print(i.find_all('h3')) 42 | flag = 0 43 | for h3tag in i.find_all('h3'): 44 | if h3tag.get('id') == 'casedigest': 45 | flag = 1 46 | break 47 | if flag == 1: 48 | for p in i.find_all('p'): 49 | if p.find('strong') != None: 50 | if p.find('strong').text == 'Summary:': 51 | for tag in p.find_all('strong'): 52 | tag.replaceWith('') 53 | # print(p.text) 54 | text = p.text 55 | #f.write(bytes(text,'UTF-8')) 56 | 57 | #f.close() 58 | if any(os.path.basename(summaryFiles[count]) in os.path.basename(s) for s in textFiles): 59 | HtmlFile = open(textFiles[count], 'r', encoding='utf-8') 60 | source_code = HtmlFile.read() 61 | soup = BeautifulSoup(source_code, 'html.parser') 62 | allDiv = soup.find_all('div', class_='docContent') 63 | 64 | f = open("NeuralSum/"+dirs+"_"+names[count]+".txt","w") 65 | print("64 ", dirs+"_"+names[count]) 66 | for i in allDiv: 67 | footNotes = i.find_all('p', class_='small center') 68 | for tag in footNotes: 69 | tag.replaceWith('') 70 | # print(i.text) 71 | f.write(i.text) 72 | # f.write("\n\n@highlight\n") 73 | #print('************') 74 | #print(text) 75 | #print('************') 76 | #print(i.text) 77 | #print('************') 78 | # f.write(text) 79 | 80 | f.close() 81 | -------------------------------------------------------------------------------- /extractive/RBM/plot.py: -------------------------------------------------------------------------------- 1 | import plotly.plotly as py 2 | import plotly.graph_objs as go 3 | 4 | trace1 = go.Scatter( 5 | x=[1, 2, 3, 4, 5, 6 | 6, 7, 8, 9, 10, 7 | 11, 12, 13, 14, 15], 8 | y=[10, 20, None, 15, 10, 9 | 5, 15, None, 20, 10, 10 | 10, 15, 25, 20, 10], 11 | name = 'No Gaps', # Style name/legend entry with html tags 12 | connectgaps=True 13 | ) 14 | trace2 = go.Scatter( 15 | x=[1, 2, 3, 4, 5, 16 | 6, 7, 8, 9, 10, 17 | 11, 12, 13, 
14, 15], 18 | y=[5, 15, None, 10, 5, 19 | 0, 10, None, 15, 5, 20 | 5, 10, 20, 15, 5], 21 | name = 'Gaps', 22 | ) 23 | 24 | data = [trace1, trace2] 25 | 26 | fig = dict(data=data) 27 | py.iplot(fig, filename='simple-connectgaps') 28 | -------------------------------------------------------------------------------- /extractive/RBM/plotLines.py: -------------------------------------------------------------------------------- 1 | import plotly.plotly as py 2 | import plotly.graph_objs as go 3 | 4 | # trace1 = go.Scatter( 5 | # x=[1, 2, 3, 4, 5, 6 | # 6, 7, 8, 9, 10, 7 | # 11, 12, 13, 14, 15], 8 | # y=[10, 20, None, 15, 10, 9 | # 5, 15, None, 20, 10, 10 | # 10, 15, 25, 20, 10], 11 | # name = 'No Gaps', # Style name/legend entry with html tags 12 | # connectgaps=True 13 | # ) 14 | trace1 = go.Scatter( 15 | y=[0.777777777778,0.8,0.866666666667,0.833333333333,0.791666666667,0.833333333333,0.861111111111,0.836363636364], 16 | x=[16,20,30,35,47,61,71,109], 17 | name = 'No Gaps', # Style name/legend entry with html tags 18 | connectgaps=True 19 | ) 20 | 21 | 22 | # trace2 = go.Scatter( 23 | # y=[0.823, 0.677,0.75,0.79], 24 | # x=[25,30,40,45], 25 | # name = 'Gaps' 26 | # ) 27 | 28 | 29 | # trace2 = go.Scatter( 30 | # x=[1, 2, 3, 4, 5, 31 | # 6, 7, 8, 9, 10, 32 | # 11, 12, 13, 14, 15], 33 | # y=[5, 15, None, 10, 5, 34 | # 0, 10, None, 15, 5, 35 | # 5, 10, 20, 15, 5], 36 | # name = 'Gaps', 37 | # ) 38 | 39 | data = [trace1] 40 | 41 | fig = dict(data=data) 42 | py.iplot(fig, filename='precision') 43 | -------------------------------------------------------------------------------- /extractive/RBM/saveCre.py: -------------------------------------------------------------------------------- 1 | import plotly 2 | plotly.tools.set_credentials_file(username='vagisha', api_key='Qcl3DGmQ8oWR81iAHOlV') -------------------------------------------------------------------------------- /extractive/RBM/sentence_labeller.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import glob 3 | import tqdm 4 | import pdb 5 | 6 | for file in tqdm.tqdm(glob.glob('./Test_Again/*.txt')): 7 | with open(file, 'r') as f: 8 | text = f.read() 9 | f.close() 10 | with open('./Extractive/ref_summ/' + file[13:], 'r') as f: 11 | summ_sent = f.read() 12 | f.close() 13 | 14 | sent_text = nltk.sent_tokenize(text) 15 | 16 | summ_word_tokens = nltk.word_tokenize(summ_sent) 17 | 18 | op = '' 19 | 20 | for sent in sent_text: 21 | tokenized_text = nltk.word_tokenize(sent) 22 | num_present = [x in summ_word_tokens for x in tokenized_text] 23 | num_present = sum(num_present * 1) 24 | # pdb.set_trace() 25 | if num_present / len(tokenized_text) < 0.25: 26 | score = 0 27 | elif num_present / len(tokenized_text) > 0.75: 28 | score = 2 29 | else: 30 | score = 1 31 | op += sent + ' \t\t' + str(score) + '\n' 32 | 33 | tokenized_doc = nltk.word_tokenize(op) 34 | # tag sentences and use nltk's Named Entity Chunker 35 | tagged_sentences = nltk.pos_tag(tokenized_doc) 36 | ne_chunked_sents = nltk.ne_chunk(tagged_sentences) 37 | 38 | # extract all named entities 39 | 40 | named_entities = [] 41 | count = 1 42 | entity_list="" 43 | 44 | 45 | for tagged_tree in ne_chunked_sents: 46 | if hasattr(tagged_tree, 'label'): 47 | entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) # 48 | entity_type = tagged_tree.label() # get NE category 49 | named_entities.append((entity_name, entity_type)) 50 | 51 | named_entities = list(set(named_entities)) 52 | # print(named_entities) 53 | for entities in named_entities: 54 
| rep_string = "@entity"+str(count) 55 | op = op.replace(entities[0], rep_string) 56 | summ_sent = summ_sent.replace(entities[0], rep_string) 57 | entity_list+="@entity"+str(count)+":"+entities[0]+"\n" 58 | count+=1 59 | 60 | # print(op) 61 | # break 62 | op+="\n\n"+summ_sent 63 | op+="\n\n"+entity_list 64 | with open('./Extractive/test_annotated_again/' + file[13:], 'w') as f: 65 | f.write(op) 66 | f.close() 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /extractive/abs_to_ext/README.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | This code converts an abstractive gold standard summary to extractive. Given the full document and its abstractive gold standard summary, the code labels each sentence in the full document either 1 (meaning that it is a summary sentence) or 0 (meaning that it is not a summary sentence). 4 | 5 | The extractive training data thus generated can be used to train extractive supervised summarization methods like [Gist](https://github.com/Law-AI/summarization/tree/aacl/extractive/Gist). 6 | 7 | ### Basic command to run: 8 | `python extractive_labels.py /path/to/base/directory` 9 | 10 | ### To view other options (show help): 11 | `python extractive_labels.py -h` 12 | 13 | ### External libraries required: 14 | - numpy 15 | - tqdm 16 | - [rouge](https://pypi.org/project/rouge/) 17 | 18 | 19 | ### Method reference 20 | You can use one of the two methods for generating labels: 21 | - **avr_rg**: For each sentence in the summary, this method selects the 3 sentences from the full text with the highest average ROUGE scores. Ref: [Narayan et al., NAACL 2018](https://www.aclweb.org/anthology/N18-1158/) 22 | - **m_rg**: This method greedily selects sentences from the full text so as to maximize the ROUGE score with respect to the summary. Ref: [Nallapati et al., AAAI 2017](https://ojs.aaai.org/index.php/AAAI/article/view/10958) 23 | -------------------------------------------------------------------------------- /extractive/abs_to_ext/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/annotated/SAKHKKAR_MILLS_MAZDOOR_SANGH-annotated.txt: -------------------------------------------------------------------------------- 1 | 2 | SAKHKKAR MILLS MAZDOOR SANGH V. GWALIOR SUGAR CO. LTD. [1985] INSC 32; AIR 1985 SC 758; 1985 (2) SCR 958; 1985 (2) SCC 134; 1985 (1) SCALE 301 (22 February 1985) 3 | 4 | 5 | 6 | 7 | S1 FI The first respondent is a company engaged in the manufacture of sugar and employing over 1100 workers, 300 of them on a permanent basis and 800 on a seasonal basis. 8 | 9 | S2 FI The permanent employees are those employed on the clerical side and in the operation and maintenance of machines. 10 | 11 | S3 FI The other 800 employees are seasonal employees who are so employed because the factory itself does not work through out the year but works during a certain season every year from December when the sugarcane crop is ready for crushing until the crushing is over. 12 | 13 | S4 FI The employer refused to pay bonus to the seasonal employees during the year 1964-65 on the ground that they were not employed through out the year.
14 | 15 | S5 FI A dispute arose between the management and the Mazdoor Sangh which was referred to the Industrial Court Madhya Pradesh, Indore for arbitration under sec.49 of the Madhya Pradesh Industrial Relations Act. 16 | 17 | S6 LR The Industrial Court decided in favour of the workers, but on a writ petition filed by the company, the award of the arbitrator was quashed and it was held that the workers were only entitled to proportionate bonus and not the minimum bonus 960 guaranteed by sec. 10 of the Payment of Bonus Act, 1965. 18 | 19 | S7 I This appeal has been filed by the Mazdoor Sangh under a certificate granted by the High Court of Madhya Pradesh. 20 | 21 | S8 I To our minds the question is a simple one and is capable of only one answer. 22 | 23 | S9 SS Sections 10 and 13 of the Payment of Bonus Act, as they stood at the relevant time, were as follows: Section 10. 24 | 25 | S10 SS PAYMENT OF MINIMUM BONUS- Subject to the provisions of ss 8 and 13 every employer shall be bound to pay to every employee in an accounting year a minimum bonus which shall be four per cent of the salary or wage earned by the employee during the accounting year or forty rupees, whichever is higher, whether there are profits in the accounting year or not: 26 | Provided that where such employee has not completed fifteen years of age at the beginning of the accounting year, the provisions of this section shall have effect in relation to such employee as if for the words 'forty rupees', the words 'twenty five rupees" were substituted. 27 | 28 | S11 SS Section13-PROPORTIONATE REDUCTION OF BONUS IN CERTAIN CASES-Where an employee has not worked for all the working days in any accounting year, the minimum bonus of forty rupees or, as the case may be, of twenty five rupees, per cent, of his salary or wage for the days he has worked in that accounting year shall be proportionately reduced. 29 | 30 | 31 | S12 LR The High Court has interpreted the words "working days in any accounting year" as meaning all those days of the year except holidays. 32 | 33 | S13 R While such an interpretation may be alright in the case of a factory which works all through the year, it would be hardly appropriate in the case of a factory which works during a particular season every year. 34 | 35 | S14 R In the case of a factory which works seasonally during an accounting year, ' working days in any accounting year" can only mean those days of the year during which the employee concerned is actually allowed to work. 36 | 37 | S15 LR That was the interpretation which was placed upon the expression by the Industrial Court 38 | 39 | S16 R and we think it is the proper interpretation. 40 | 41 | S17 R Having regard to the scheme and the purpose of the Act, we do not think that the High Court was justified in placing a different construction on the meaning of the expression "working days in any accounting year. 42 | 43 | S18 R We, therefore, set aside the judgment of the High Court and restore the award of the Industrial Court. 44 | 45 | S19 R The bonus payable to the employees will carry interest at the rate of nine per cent per annum from the day when the bonus became due until the date of payment. 46 | 47 | S20 R The appeal is allowed with costs. 48 | 49 | S21 R N.V.K Appeal allowed. 
50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /extractive/annotated/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/top_ngrams/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------