├── .gitignore ├── README.md ├── abstractive ├── BART_based_approaches │ ├── BART_fineTune.ipynb │ ├── BART_utilities.py │ ├── Readme.md │ ├── generate_summaries_chunking_BART.ipynb │ ├── generate_summaries_chunking_BART_RR.ipynb │ └── generate_summaries_chunking_BERT_BART.ipynb ├── BertSum-Abs │ ├── PreSumm │ │ ├── LICENSE │ │ ├── README.md │ │ ├── bert_data │ │ │ └── .gitignore │ │ ├── json_data │ │ │ └── cnndm_sample.train.0.json │ │ ├── logs │ │ │ └── .gitignore │ │ ├── models │ │ │ └── .gitignore │ │ ├── raw_data │ │ │ ├── temp.raw_src │ │ │ ├── temp.raw_tgt │ │ │ └── temp_ext.raw_src │ │ ├── requirements.txt │ │ ├── results │ │ │ └── .gitignore │ │ ├── src │ │ │ ├── cal_rouge.py │ │ │ ├── distributed.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── adam.py │ │ │ │ ├── data_loader.py │ │ │ │ ├── decoder.py │ │ │ │ ├── encoder.py │ │ │ │ ├── loss.py │ │ │ │ ├── model_builder.py │ │ │ │ ├── neural.py │ │ │ │ ├── optimizers.py │ │ │ │ ├── predictor.py │ │ │ │ ├── reporter.py │ │ │ │ ├── reporter_ext.py │ │ │ │ ├── trainer.py │ │ │ │ └── trainer_ext.py │ │ │ ├── others │ │ │ │ ├── __init__.py │ │ │ │ ├── logging.py │ │ │ │ ├── pyrouge.py │ │ │ │ ├── tokenization.py │ │ │ │ └── utils.py │ │ │ ├── post_stats.py │ │ │ ├── prepro │ │ │ │ ├── __init__.py │ │ │ │ ├── data_builder.py │ │ │ │ ├── smart_common_words.txt │ │ │ │ └── utils.py │ │ │ ├── preprocess.py │ │ │ ├── train.py │ │ │ ├── train_abstractive.py │ │ │ ├── train_extractive.py │ │ │ └── translate │ │ │ │ ├── __init__.py │ │ │ │ ├── beam.py │ │ │ │ └── penalties.py │ │ └── urls │ │ │ ├── cnn_mapping_test.txt │ │ │ ├── cnn_mapping_train.txt │ │ │ ├── cnn_mapping_valid.txt │ │ │ ├── mapping_test.txt │ │ │ └── mapping_valid.txt │ ├── Readme.md │ └── chunker.ipynb ├── BigBird │ ├── Pretrained_Bigbird.ipynb │ └── Readme.md ├── Generate_fine_tuning_data │ ├── Readme.md │ ├── gen_fine_tuning_data_CLS.ipynb │ ├── gen_fine_tuning_data_MCS.ipynb │ ├── gen_fine_tuning_data_MCS_RR.ipynb │ └── gen_fine_tuning_data_SIF.ipynb ├── Legal-LED │ ├── Legal-LED_Finetune.ipynb │ ├── Legal-LED_generate_summary.ipynb │ └── Readme.md ├── Legal-Pegasus │ ├── Readme.md │ ├── pegasus-test.ipynb │ └── pegasus_finetune.ipynb ├── Pointer Generator │ ├── chunker.ipynb │ ├── combiner.ipynb │ └── readme.md ├── README.md └── utilities.py ├── dataset └── README.md └── extractive ├── CaseSummarizer ├── README.md ├── dictionary.txt ├── gitkeep ├── preprocess.py ├── sample folder │ ├── gitkeep │ └── sample input.txt ├── sample_processed │ ├── gitkeep │ └── sample input.txt ├── sample_shortsumm │ ├── gitkeep │ └── sample input.txt ├── sample_summary │ ├── gitkeep │ └── sample input.txt ├── summary.py └── summary_length.py ├── Gist ├── README.md ├── codes │ ├── generate_features.py │ ├── generate_summary.py │ ├── gitkeep │ ├── imp_words.py │ ├── open_words.py │ ├── pos_tags.py │ ├── quant_feats.py │ ├── sent_pos.py │ ├── train_classifier.py │ ├── word2vec.py │ └── word_emb.py ├── cue_phrases.txt ├── generate_features.py ├── gitkeep ├── infer.py ├── length.txt ├── postags.txt └── train.py ├── GraphicalModel ├── README.md ├── crf_alltrain.model ├── crf_test.py ├── crf_train.py ├── extract-categories.py ├── formulated_constants.py ├── get-n-grams.py ├── gitkeep ├── graphicalModel.py ├── k_mix_model_test.py ├── k_mix_output.txt └── results_crf.txt ├── LetSum ├── README.md ├── extract-categories.py ├── formulated_constants.py ├── get-n-grams.py ├── gitkeep ├── letsum.py ├── letsum_test.py ├── modified_letsum.py └── results_letsum.txt ├── MMR ├── MMR.py ├── README.md └── 
gitkeep ├── RBM ├── README.md ├── Summarizer.py ├── enhancedfeatureSum ├── entity2.py ├── extractiveDemo.py ├── featureSum ├── gitkeep ├── logistic_sgd.py ├── ner.py ├── numofSen.py ├── para_reader.py ├── parseForExtract.py ├── plot.py ├── plotLines.py ├── rbm.py ├── saveCre.py └── sentence_labeller.py ├── README.md ├── abs_to_ext ├── README.md ├── extractive_labels.py └── gitkeep ├── annotated ├── A.R._KRISHNAMURTHY-annotated.txt ├── AMAR_KANT_CHOUDHARY-annotated.txt ├── Andhra_sugars.json ├── Archit.json ├── BIHARI_CHOWDHARY-annotated.txt ├── BISHNU_CHAND_LAL_CHAUDHARY-annotated.txt ├── Bhayana.json ├── Chandrachud.json ├── Commissioner_delhi.json ├── GANPAT_GIRI-annotated.txt ├── GUNENDRA_PRASAD_SEN_GUPTA-annotated.txt ├── Gwalior.json ├── Hameedullah.json ├── JCShah.json ├── KRISHNA_ALIAS_RAJU-annotated.txt ├── KUMAR_SUDHENDU_NARAIN_DEB-annotated.txt ├── Kapur.json ├── Ramaswamy.json ├── SAKHKKAR_MILLS_MAZDOOR_SANGH-annotated.txt ├── Sabyasachi_Mukherjee.json ├── Subbarao.json ├── V_P_Sayed_Mohammed-Iteration_02-annotated.txt ├── Writ_petition.json ├── Write_petition_civil.json ├── gitkeep └── kalawati.json └── top_ngrams ├── gitkeep ├── top_bigrams.txt ├── top_quadrigrams.txt ├── top_trigrams.txt └── top_unigrams.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains implementations and datasets made available in the following papers : 4 | 5 | - Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation accepted at AACL-IJCNLP 2022. 6 | - A Comparative Study of Summarization Algorithms Applied to Legal Case Judgments accepted at ECIR 2019. 7 | 8 | We provide the code base of various Extractive and Abstractive summarization algorithms applied to legal case documents. We also provide three summarization datasets from the Indian and UK Supreme Court case documents. 9 | 10 | # Dataset 11 | 12 | We make available 3 legal document summarization datasets. Please refer to the *dataset* folder for more details : 13 | 14 | - IN-Abs : Indian Supreme Court case documents & their *abstractive* summaries, obtained from http://www.liiofindia.org/in/cases/cen/INSC/ 15 | - IN-Ext : Indian Supreme Court case documents & their *extractive* summaries, written by two law experts (A1, A2). 16 | - UK-Abs : United Kingdom (U.K.) 
Supreme Court case documents & their *abstractive* summaries, obtained from https://www.supremecourt.uk/decided-cases/ 17 | 18 | 19 | # Extractive Summarization methods 20 | 21 | We provide the implementations of the following methods in the *extractive* folder of the repository : 22 | 23 | - [Improving Legal Document Summarization Using Graphical Models](https://www.cse.iitm.ac.in/~ravi/papers/Saravanan_jurix_06.pdf), JURIX 2006 24 | - [LetSum, a Text Summarization system in Law field](http://rali.iro.umontreal.ca/rali/?q=en/node/673), JURIX 2004 25 | - [CaseSummarizer](http://www.aclweb.org/anthology/C16-2054), COLING 2016 26 | - [Automatic Summarization of Legal Decisions using Iterative Masking of Predictive Sentences](https://dl.acm.org/doi/10.1145/3322640.3326728), ICAIL 2019 27 | - [Extracting the Gist of Chinese Judgments of the Supreme Court](https://dl.acm.org/doi/10.1145/3322640.3326715), ICAIL 2019 28 | - [Extractive Summarization using Deep Learning](https://arxiv.org/pdf/1708.04439.pdf), arXiv preprint, 2017 29 | - We also provide a script to convert an abstractive gold-standard summary into an extractive one, for training supervised extractive summarization methods. 30 | 31 | 32 | # Abstractive Summarization methods 33 | 34 | - Scripts to generate fine-tuning data using the following techniques 35 | - CLS 36 | - SIF 37 | - MCS 38 | - MCS_RR 39 | - Scripts to fine-tune the following models 40 | - [BART](https://huggingface.co/facebook/bart-large) 41 | - [legal-Pegasus](https://huggingface.co/nsi319/legal-pegasus) 42 | - [Legal-LED](https://huggingface.co/nsi319/legal-led-base-16384) 43 | - Scripts to generate summaries using the following models/methods 44 | - BART (CLS, MCS, SIF) 45 | - BART_RR 46 | - BERT-BART 47 | - Legal-LED 48 | - Pegasus 49 | - Helper chunking and combiner scripts for the BertSum-Abs and Pointer Generator methods 50 | 51 | # Availability of Trained Models 52 | 53 | The following models trained on the IN-Abs & UK-Abs datasets are publicly available at https://zenodo.org/record/7234359#.Y2tShdJByC1 54 | 55 | - Extractive : SummaRuNNer, Gist 56 | 57 | - Abstractive : Legal-Pegasus, Legal-LED 58 | 59 | 63 | 64 | # Citation 65 | If you are using the implementations / datasets / trained models, please cite the following papers: 66 | ``` 67 | @inproceedings{shukla2022, 68 | title={Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation}, 69 | author={Shukla, Abhay and Bhattacharya, Paheli and Poddar, Soham and Mukherjee, Rajdeep and Ghosh, Kripabandhu and Goyal, Pawan and Ghosh, Saptarshi}, 70 | booktitle={The 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing}, 71 | year={2022} 72 | } 73 | 74 | @inproceedings{bhattacharya2019comparative, 75 | title={A comparative study of summarization algorithms applied to legal case judgments}, 76 | author={Bhattacharya, Paheli and Hiware, Kaustubh and Rajgaria, Subham and Pochhi, Nilay and Ghosh, Kripabandhu and Ghosh, Saptarshi}, 77 | booktitle={European Conference on Information Retrieval}, 78 | pages={413--428}, 79 | year={2019}, 80 | organization={Springer} 81 | } 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts 2 | 3 | BART_fineTune.ipynb - Script for fine-tuning BART model 4 | 5 | 
generate_summaries_chunking_BART.ipynb - Script to generate summaries using chunking based BART method 6 | 7 | generate_summaries_chunking_BART_RR.ipynb - Script to generate summaries using chunking based BART_RR method 8 | 9 | generate_summaries_chunking_BERT_BART.ipynb - Script to generate summaries using chunking based BERT_BART method 10 | 11 | # Requirements 12 | 13 | transformers 4.12.3 14 | 15 | pytorch 1.10 16 | 17 | pytorch lightning 1.5.1 18 | 19 | # Usage 20 | 21 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 22 | 2. Follow the instructions given in the notebooks 23 | 24 | Note that rhetorical role-based segmenting requires rhetorical role labeled documents. Each line in the document would contain a sentence, \t, and the label (sentence_1\tlabel). The path to the directory of labeled documents should be updated in the utility.py. 25 | 26 | Rhetorical role labeling - https://link.springer.com/article/10.1007/s10506-021-09304-5 27 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/generate_summaries_chunking_BART.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"cea86415","metadata":{"id":"cea86415"},"source":["### Script to generate summaries using chunking based BART method\n","\n","Assign the dataset and output_path variable according to requirements. \n"]},{"cell_type":"code","execution_count":null,"id":"b13ccd09","metadata":{"id":"b13ccd09"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"ad7a3c99","metadata":{"id":"ad7a3c99"},"outputs":[],"source":["import sys\n","from BART_utilities import *\n","sys.path.insert(0, '../')\n","from utilities import *\n","\n","import transformers\n","import pandas as pd\n","import numpy as np\n","import glob\n","import nltk\n","import torch\n","import math\n","import random\n","import re\n","import argparse\n","import os\n"]},{"cell_type":"code","execution_count":null,"id":"383b92c3","metadata":{"id":"383b92c3"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"ea21bb4b","metadata":{"id":"ea21bb4b"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig\n","\n","\n","tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)\n","\n","model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\")"]},{"cell_type":"markdown","id":"67b07aa2","metadata":{"id":"67b07aa2"},"source":["### For using fine tuned model \n","1. uncomment the 2nd line in the following cell\n","2. 
add the path to the fine tuned model"]},{"cell_type":"code","execution_count":null,"id":"98713de3","metadata":{"id":"98713de3"},"outputs":[],"source":["bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)\n","\n","# bart_model = LitModel.load_from_checkpoint(\"/home/pahelibhattacharya/HULK/Abhay/models/BART_large_IN_MCS.ckpt\",\n","# learning_rate = 2e-5, tokenizer = tokenizer, model = model).to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"39eae844","metadata":{"id":"39eae844"},"outputs":[],"source":["def generate_summary_gpu(nested_sentences,p=0.2):\n"," '''\n"," Function to generate summaries from the list containing chunks of the document\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," summaries = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n"," input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')\n"," input_tokenized = input_tokenized.to(device)\n"," summary_ids = bart_model.model.to(device).generate(input_tokenized,\n"," length_penalty=0.01,\n"," min_length=l-5,\n"," max_length=l+5)\n"," output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]\n"," summaries.append(output)\n"," summaries = [sentence for sublist in summaries for sentence in sublist]\n"," return summaries"]},{"cell_type":"code","execution_count":null,"id":"f473093c","metadata":{"id":"f473093c"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"2bdf4344","metadata":{"id":"2bdf4344"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","for i in range(len(data_source)):\n"," name = names[i]\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len), end = \" ,\")\n"," \n"," nested = nest_sentences(doc,1024)\n"," l = int(req_len/len(nested))\n"," p = float(req_len/input_len)\n"," print(p)\n"," \n"," abs_summ = generate_summary_gpu(nested,p)\n","# print(abs_summ)\n","# break\n"," abs_summ = \" \".join(abs_summ)\n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n","# print(abs_summ)\n","# break\n"," print(len((abs_summ.split(\" \"))))\n"," \n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n"," \n"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"generate_summaries_chunking_BART.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/generate_summaries_chunking_BART_RR.ipynb: -------------------------------------------------------------------------------- 1 | 
{"cells":[{"cell_type":"markdown","id":"646853af","metadata":{"id":"646853af"},"source":["### Script to generate summaries using chunking based BART RR method\n","\n","Assign the dataset and output_path variable according to requirements. "]},{"cell_type":"code","execution_count":null,"id":"3fd5817d","metadata":{"id":"3fd5817d"},"outputs":[],"source":["dataset = \"N2\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"ad7a3c99","metadata":{"id":"ad7a3c99"},"outputs":[],"source":["import sys\n","from BART_utilities import *\n","sys.path.insert(0, '../')\n","import transformers\n","import pandas as pd\n","import numpy as np\n","import glob\n","import nltk\n","import torch\n","import math\n","import random\n","import re\n","import argparse\n","import os\n","from utilities import *"]},{"cell_type":"code","execution_count":null,"id":"383b92c3","metadata":{"id":"383b92c3"},"outputs":[],"source":["#Reading the test documents\n","names, data_source = get_summary_data_rhet_test(dataset)\n","print(len(names))\n","print(len(data_source))\n","dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"ea21bb4b","metadata":{"id":"ea21bb4b"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig\n","\n","\n","tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)\n","\n","model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\")\n","\n","new_tokens = ['', '', '', '', '

', '', '']\n","# tokenizer.add_special_tokens(new_tokens)\n","\n","special_tokens_dict = {'additional_special_tokens': new_tokens}\n","num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)\n","model.resize_token_embeddings(len(tokenizer))"]},{"cell_type":"markdown","id":"b3aa4218","metadata":{"id":"b3aa4218"},"source":["#### Add the path to fine tuned model"]},{"cell_type":"code","execution_count":null,"id":"865f3020","metadata":{"id":"865f3020"},"outputs":[],"source":["bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)\n","# bart_model = LitModel.load_from_checkpoint(\"path to model\",learning_rate = 2e-5, tokenizer = tokenizer, model = model).to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"76586532","metadata":{"id":"76586532"},"outputs":[],"source":["def nest_sentencesV3(doc, chunk_length):\n"," '''\n"," function to first segment the document using rhetorical roles and then chunk if required\n"," input: doc_sents - Input document sentence\n"," chunk_length - chunk length\n"," output: list of chunks\n"," '''\n"," doc_sents, _, dict_sents_labels = get_doc_sens_and_labels(doc)\n"," s = list(set(dict_sents_labels.values()))\n","# print(s)\n"," all_chunks = []\n"," \n"," for label in s:\n"," doc_sents_withlabels = []\n"," for sent in doc_sents:\n"," if sent == '':continue\n"," if dict_sents_labels[sent] == label:\n"," doc_sents_withlabels.append(sent)\n"," chunks = nest_sentencesMV2(doc_sents_withlabels, chunk_length)\n"," \n"," edited_chunks = []\n"," for chunk in chunks:\n"," edited_chunks.append([\"<\" + label + \">\"] + chunk)\n"," #modified\n"," \n"," all_chunks = all_chunks + ['. '.join(i) for i in edited_chunks]\n","\n"," return all_chunks \n"]},{"cell_type":"code","execution_count":null,"id":"39eae844","metadata":{"id":"39eae844"},"outputs":[],"source":["def generate_summary_gpu(nested_sentences,p):\n"," '''\n"," Function to generate summaries from the list containing chunks of the document\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," summaries = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n"," input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')\n"," input_tokenized = input_tokenized.to(device)\n"," summary_ids = bart_model.model.to(device).generate(input_tokenized,\n"," length_penalty=0.05,\n"," min_length=l-5,\n"," max_length=l+5)\n","\n"," output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]\n"," summaries.append(output)\n"," summaries = [sentence for sublist in summaries for sentence in sublist]\n"," return summaries"]},{"cell_type":"code","execution_count":null,"id":"85033199","metadata":{"id":"85033199"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"2bdf4344","metadata":{"id":"2bdf4344"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","output = []\n","for i in range(len(data_source)):\n"," name = names[i]\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len), end = \", \")\n"," \n"," nested = nest_sentencesV3(doc,1024)\n"," p = float(req_len/input_len)\n"," 
print(p)\n"," abs_summ = generate_summary_gpu(nested,p)\n"," abs_summ = \" \".join(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," \n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n","# print(abs_summ)\n","# break\n"," print(len((abs_summ.split(\" \"))))\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n"," \n","print(output)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"generate_summaries_chunking_BART_RR.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BART_based_approaches/generate_summaries_chunking_BERT_BART.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"8793d0c1","metadata":{"id":"8793d0c1"},"source":["### Script to generate summaries using chunking based BERT_BART method\n","\n","Assign the dataset and ouput_folder variable according to requirements. \n"]},{"cell_type":"code","execution_count":null,"id":"b94e8ba9","metadata":{"id":"b94e8ba9"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./IN_BERT_BART/\""]},{"cell_type":"code","execution_count":null,"id":"ad7a3c99","metadata":{"id":"ad7a3c99"},"outputs":[],"source":["import sys\n","from BART_utilities import *\n","sys.path.insert(0, '../')\n","import transformers\n","import pandas as pd\n","import numpy as np\n","import glob\n","import nltk\n","import torch\n","from utilities import *\n","import math\n","import random\n","import re\n","import argparse\n","import os"]},{"cell_type":"code","execution_count":null,"id":"383b92c3","metadata":{"id":"383b92c3"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))"]},{"cell_type":"code","execution_count":null,"id":"41e0ecab","metadata":{"id":"41e0ecab"},"outputs":[],"source":["dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"ea21bb4b","metadata":{"id":"ea21bb4b"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig\n","\n","tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)\n","\n","model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\")"]},{"cell_type":"markdown","id":"b7902d06","metadata":{"id":"b7902d06"},"source":["### For using fine tuned model \n","1. uncomment the 2nd line\n","2. 
add the path to the fine tuned model"]},{"cell_type":"code","execution_count":null,"id":"37717f34","metadata":{"id":"37717f34"},"outputs":[],"source":["bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)\n","\n","# bart_model = LitModel.load_from_checkpoint(\"/home/pahelibhattacharya/HULK/Abhay/models/BART_large_IN_MCS.ckpt\",\n","# learning_rate = 2e-5, tokenizer = tokenizer, model = model).to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"39eae844","metadata":{"id":"39eae844"},"outputs":[],"source":["def generate_summary_gpu(nested_sentences,p=0.2):\n"," '''\n"," Function to generate summaries from the list containing chunks of the document\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," summaries = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n","# print(l)\n"," input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')\n"," input_tokenized = input_tokenized.to(device)\n"," summary_ids = bart_model.model.to('cuda').generate(input_tokenized,\n"," length_penalty=0.01,\n"," min_length=l-5,\n"," max_length=l+5)\n"," output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]\n"," summaries.append(output)\n"," summaries = [sentence for sublist in summaries for sentence in sublist]\n"," return summaries"]},{"cell_type":"code","execution_count":null,"id":"69571a9f","metadata":{"id":"69571a9f"},"outputs":[],"source":["from summarizer import Summarizer\n","model = Summarizer()"]},{"cell_type":"code","execution_count":null,"id":"93b0d593","metadata":{"id":"93b0d593"},"outputs":[],"source":["def summ_doc(doc, req_len):\n"," '''\n"," function to generate summaries for a document\n"," input: doc - document\n"," req_len - Required summary length\n"," output: summary generated using BERT_BART method\n"," '''\n"," doc_len = len(doc.split(\" \"))\n"," r = (5*1024)/doc_len\n"," if r < 1 and req_len < 5*1024:\n"," ext_result = model(doc, ratio=r)\n"," else:\n"," ext_result = doc\n"," nested = nest_sentences(ext_result,1024)\n"," p = float(req_len/len(ext_result.split(\" \")))\n"," abs_summ = generate_summary_gpu(nested,p)\n"," \n"," summ = \" \".join(abs_summ)\n"," return summ"]},{"cell_type":"code","execution_count":null,"id":"1841dc0c","metadata":{"id":"1841dc0c"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"2bdf4344","metadata":{"id":"2bdf4344"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","import pandas as pd\n","output = []\n","done = glob.glob(output_path)\n","done = [i[i.rfind(\"/\")+1:] for i in done]\n","print(done)\n","\n","for i in range(len(data_source)):\n"," name = names[i]\n"," if name in done:continue\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len))\n"," \n"," abs_summ = summ_doc(doc, req_len)\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n"," \n","print(output)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"generate_summaries_chunking_BERT_BART.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft 
store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Yang Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/bert_data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/logs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/raw_data/temp.raw_src: -------------------------------------------------------------------------------- 1 | this Terry Jones had a love of the absurd that contributed much to the anarchic humour of Monty Python's Flying Circus. His style of visual comedy, leavened with a touch of the surreal, inspired many comedians who followed him. It was on Python that he honed his directing skills, notably on Life of Brian and The Meaning of Life. A keen historian, he wrote a number of books and fronted TV documentaries on ancient and medieval history. Terence Graham Parry Jones was born in Colwyn Bay in north Wales on 1 February 1942. His grandparents ran the local amateur operatic society and staged Gilbert and Sullivan concerts on the town's pier each year His family moved to Surrey when he was four but he always felt nostalgic about his native land. "I couldn't bear it and for the longest time I wanted Wales back," he once said. 
"I still feel very Welsh and feel it's where I should be really." After leaving the Royal Grammar School in Guildford, where he captained the school, he went on to read English at St Edmund Hall, Oxford. However, as he put it, he "strayed into history", the subject in which he graduated. While at Oxford he wrote sketches for the Oxford Revue and performed alongside a fellow student, Michael Palin. 2 | (CNN) An Iranian chess referee says she is frightened to return home after she was criticized online for not wearing the appropriate headscarf during an international tournament. Currently the chief adjudicator at the Women's World Chess Championship held in Russia and China, Shohreh Bayat says she fears arrest after a photograph of her was taken during the event and was then circulated online in Iran. "They are very sensitive about the hijab when we are representing Iran in international events and even sometimes they send a person with the team to control our hijab," Bayat told CNN Sport in a phone interview Tuesday. The headscarf, or the hijab, has been a mandatory part of women's dress in Iran since the 1979 Islamic revolution but, in recent years, some women have mounted opposition and staged protests about headwear rules. Bayat said she had been wearing a headscarf at the tournament but that certain camera angles had made it look like she was not. "If I come back to Iran, I think there are a few possibilities. It is highly possible that they arrest me [...] or it is possible that they invalidate my passport," added Bayat. "I think they want to make an example of me." The photographs were taken at the first stage of the chess championship in Shanghai, China, but Bayat has since flown to Vladivostok, Russia, for the second leg between Ju Wenjun and Aleksandra Goryachkina. She was left "panicked and shocked" when she became aware of the reaction in Iran after checking her phone in the hotel room. The 32-year-old said she felt helpless as websites reportedly condemned her for what some described as protesting the country's compulsory law. Subsequently, Bayat has decided to no longer wear the headscarf. "I'm not wearing it anymore because what is the point? I was just tolerating it, I don't believe in the hijab," she added. "People must be free to choose to wear what they want, and I was only wearing the hijab because I live in Iran and I had to wear it. I had no other choice." Bayat says she sought help from the country's chess federation. She says the federation told her to post an apology on her social media channels. She agreed under the condition that the federation would guarantee her safety but she said they refused. "My husband is in Iran, my parents are in Iran, all my family members are in Iran. I don't have anyone else outside of Iran. I don't know what to say, this is a very hard situation," she said. CNN contacted the Iranian Chess Federation on Tuesday but has yet to receive a response. -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/raw_data/temp.raw_tgt: -------------------------------------------------------------------------------- 1 | this Terry Jones had a love of the absurd that contributed much to the anarchic humour of Monty Python's Flying Circus. 2 | An Iranian chess referee says she is frightened to return home after she was criticized online for not wearing the appropriate headscarf during an international tournament. 
-------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/raw_data/temp_ext.raw_src: -------------------------------------------------------------------------------- 1 | this Terry Jones had a love of the absurd that contributed much to the anarchic humour of Monty Python's Flying Circus. [CLS] [SEP] His style of visual comedy, leavened with a touch of the surreal, inspired many comedians who followed him. [CLS] [SEP] It was on Python that he honed his directing skills, notably on Life of Brian and The Meaning of Life. [CLS] [SEP] A keen historian, he wrote a number of books and fronted TV documentaries on ancient and medieval history. [CLS] [SEP] Terence Graham Parry Jones was born in Colwyn Bay in north Wales on 1 February 1942. [CLS] [SEP] His grandparents ran the local amateur operatic society and staged Gilbert and Sullivan concerts on the town's pier each year His family moved to Surrey when he was four but he always felt nostalgic about his native land. [CLS] [SEP] "I couldn't bear it and for the longest time I wanted Wales back," he once said. [CLS] [SEP] "I still feel very Welsh and feel it's where I should be really." After leaving the Royal Grammar School in Guildford, where he captained the school, he went on to read English at St Edmund Hall, Oxford. [CLS] [SEP] However, as he put it, he "strayed into history", the subject in which he graduated. [CLS] [SEP] While at Oxford he wrote sketches for the Oxford Revue and performed alongside a fellow student, Michael Palin. 2 | (CNN) An Iranian chess referee says she is frightened to return home after she was criticized online for not wearing the appropriate headscarf during an international tournament. [CLS] [SEP] Currently the chief adjudicator at the Women's World Chess Championship held in Russia and China, Shohreh Bayat says she fears arrest after a photograph of her was taken during the event and was then circulated online in Iran. [CLS] [SEP] "They are very sensitive about the hijab when we are representing Iran in international events and even sometimes they send a person with the team to control our hijab," Bayat told CNN Sport in a phone interview Tuesday. [CLS] [SEP] The headscarf, or the hijab, has been a mandatory part of women's dress in Iran since the 1979 Islamic revolution but, in recent years, some women have mounted opposition and staged protests about headwear rules. [CLS] [SEP] Bayat said she had been wearing a headscarf at the tournament but that certain camera angles had made it look like she was not. [CLS] [SEP] "If I come back to Iran, I think there are a few possibilities. [CLS] [SEP] It is highly possible that they arrest me [. [CLS] [SEP]. [CLS] [SEP]. [CLS] [SEP]] or it is possible that they invalidate my passport," added Bayat. [CLS] [SEP] "I think they want to make an example of me. [CLS] [SEP]" The photographs were taken at the first stage of the chess championship in Shanghai, China, but Bayat has since flown to Vladivostok, Russia, for the second leg between Ju Wenjun and Aleksandra Goryachkina. [CLS] [SEP] She was left "panicked and shocked" when she became aware of the reaction in Iran after checking her phone in the hotel room. [CLS] [SEP] The 32-year-old said she felt helpless as websites reportedly condemned her for what some described as protesting the country's compulsory law. [CLS] [SEP] Subsequently, Bayat has decided to no longer wear the headscarf. [CLS] [SEP] "I'm not wearing it anymore because what is the point? 
I was just tolerating it, I don't believe in the hijab," she added. [CLS] [SEP] "People must be free to choose to wear what they want, and I was only wearing the hijab because I live in Iran and I had to wear it. [CLS] [SEP] I had no other choice. [CLS] [SEP]" Bayat says she sought help from the country's chess federation. [CLS] [SEP] She says the federation told her to post an apology on her social media channels. [CLS] [SEP] She agreed under the condition that the federation would guarantee her safety but she said they refused. [CLS] [SEP] "My husband is in Iran, my parents are in Iran, all my family members are in Iran. [CLS] [SEP] I don't have anyone else outside of Iran. [CLS] [SEP] I don't know what to say, this is a very hard situation," she said. [CLS] [SEP] CNN contacted the Iranian Chess Federation on Tuesday but has yet to receive a response. -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/requirements.txt: -------------------------------------------------------------------------------- 1 | multiprocess==0.70.9 2 | numpy==1.17.2 3 | pyrouge==0.1.3 4 | pytorch-transformers==1.2.0 5 | tensorboardX==1.9 6 | torch==1.1.0 -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/results/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/cal_rouge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | # from multiprocess import Pool as Pool2 5 | from multiprocessing import Pool 6 | 7 | import shutil 8 | import sys 9 | import codecs 10 | 11 | # from onmt.utils.logging import init_logger, logger 12 | from others import pyrouge 13 | 14 | 15 | def process(data): 16 | candidates, references, pool_id = data 17 | cnt = len(candidates) 18 | current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) 19 | tmp_dir = "rouge-tmp-{}-{}".format(current_time,pool_id) 20 | if not os.path.isdir(tmp_dir): 21 | os.mkdir(tmp_dir) 22 | os.mkdir(tmp_dir + "/candidate") 23 | os.mkdir(tmp_dir + "/reference") 24 | try: 25 | 26 | for i in range(cnt): 27 | if len(references[i]) < 1: 28 | continue 29 | with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w", 30 | encoding="utf-8") as f: 31 | f.write(candidates[i]) 32 | with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w", 33 | encoding="utf-8") as f: 34 | f.write(references[i]) 35 | r = pyrouge.Rouge155() 36 | r.model_dir = tmp_dir + "/reference/" 37 | r.system_dir = tmp_dir + "/candidate/" 38 | r.model_filename_pattern = 'ref.#ID#.txt' 39 | r.system_filename_pattern = r'cand.(\d+).txt' 40 | rouge_results = r.convert_and_evaluate() 41 | print(rouge_results) 42 | results_dict = r.output_to_dict(rouge_results) 43 | finally: 44 | pass 45 | if os.path.isdir(tmp_dir): 46 | shutil.rmtree(tmp_dir) 47 | return results_dict 48 | 49 | 50 | 51 | 52 | def chunks(l, n): 53 | """Yield successive n-sized chunks from l.""" 54 | for i in range(0, len(l), n): 55 | yield l[i:i + n] 56 | 57 | def test_rouge(cand, ref,num_processes): 58 | """Calculate ROUGE scores of sequences passed as an iterator 59 | e.g. 
a list of str, an open file, StringIO or even sys.stdin 60 | """ 61 | candidates = [line.strip() for line in cand] 62 | references = [line.strip() for line in ref] 63 | 64 | print(len(candidates)) 65 | print(len(references)) 66 | assert len(candidates) == len(references) 67 | candidates_chunks = list(chunks(candidates, int(len(candidates)/num_processes))) 68 | references_chunks = list(chunks(references, int(len(references)/num_processes))) 69 | n_pool = len(candidates_chunks) 70 | arg_lst = [] 71 | for i in range(n_pool): 72 | arg_lst.append((candidates_chunks[i],references_chunks[i],i)) 73 | pool = Pool(n_pool) 74 | results = pool.map(process,arg_lst) 75 | final_results = {} 76 | for i,r in enumerate(results): 77 | for k in r: 78 | if(k not in final_results): 79 | final_results[k] = r[k]*len(candidates_chunks[i]) 80 | else: 81 | final_results[k] += r[k] * len(candidates_chunks[i]) 82 | for k in final_results: 83 | final_results[k] = final_results[k]/len(candidates) 84 | return final_results 85 | def rouge_results_to_str(results_dict): 86 | return ">> ROUGE-F(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\nROUGE-R(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\n".format( 87 | results_dict["rouge_1_f_score"] * 100, 88 | results_dict["rouge_2_f_score"] * 100, 89 | # results_dict["rouge_3_f_score"] * 100, 90 | results_dict["rouge_l_f_score"] * 100, 91 | results_dict["rouge_1_recall"] * 100, 92 | results_dict["rouge_2_recall"] * 100, 93 | # results_dict["rouge_3_f_score"] * 100, 94 | results_dict["rouge_l_recall"] * 100 95 | 96 | # ,results_dict["rouge_su*_f_score"] * 100 97 | ) 98 | 99 | 100 | if __name__ == "__main__": 101 | # init_logger('test_rouge.log') 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument('-c', type=str, default="candidate.txt", 104 | help='candidate file') 105 | parser.add_argument('-r', type=str, default="reference.txt", 106 | help='reference file') 107 | parser.add_argument('-p', type=int, default=1, 108 | help='number of processes') 109 | args = parser.parse_args() 110 | print(args.c) 111 | print(args.r) 112 | print(args.p) 113 | if args.c.upper() == "STDIN": 114 | candidates = sys.stdin 115 | else: 116 | candidates = codecs.open(args.c, encoding="utf-8") 117 | references = codecs.open(args.r, encoding="utf-8") 118 | 119 | results_dict = test_rouge(candidates, references,args.p) 120 | # return 0 121 | print(time.strftime('%H:%M:%S', time.localtime()) 122 | ) 123 | print(rouge_results_to_str(results_dict)) 124 | # logger.info(rouge_results_to_str(results_dict)) -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/distributed.py: -------------------------------------------------------------------------------- 1 | """ Pytorch Distributed utils 2 | This piece of code was heavily inspired by the equivalent of Fairseq-py 3 | https://github.com/pytorch/fairseq 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | 9 | import math 10 | import pickle 11 | 12 | import torch.distributed 13 | 14 | from others.logging import logger 15 | 16 | 17 | def is_master(gpu_ranks, device_id): 18 | return gpu_ranks[device_id] == 0 19 | 20 | 21 | def multi_init(device_id, world_size,gpu_ranks): 22 | print(gpu_ranks) 23 | dist_init_method = 'tcp://localhost:10000' 24 | dist_world_size = world_size 25 | torch.distributed.init_process_group( 26 | backend='nccl', init_method=dist_init_method, 27 | world_size=dist_world_size, rank=gpu_ranks[device_id]) 28 | gpu_rank = torch.distributed.get_rank() 29 | if not is_master(gpu_ranks, device_id): 30 
| # print('not master') 31 | logger.disabled = True 32 | 33 | return gpu_rank 34 | 35 | 36 | 37 | def all_reduce_and_rescale_tensors(tensors, rescale_denom, 38 | buffer_size=10485760): 39 | """All-reduce and rescale tensors in chunks of the specified size. 40 | 41 | Args: 42 | tensors: list of Tensors to all-reduce 43 | rescale_denom: denominator for rescaling summed Tensors 44 | buffer_size: all-reduce chunk size in bytes 45 | """ 46 | # buffer size in bytes, determine equiv. # of elements based on data type 47 | buffer_t = tensors[0].new( 48 | math.ceil(buffer_size / tensors[0].element_size())).zero_() 49 | buffer = [] 50 | 51 | def all_reduce_buffer(): 52 | # copy tensors into buffer_t 53 | offset = 0 54 | for t in buffer: 55 | numel = t.numel() 56 | buffer_t[offset:offset+numel].copy_(t.view(-1)) 57 | offset += numel 58 | 59 | # all-reduce and rescale 60 | torch.distributed.all_reduce(buffer_t[:offset]) 61 | buffer_t.div_(rescale_denom) 62 | 63 | # copy all-reduced buffer back into tensors 64 | offset = 0 65 | for t in buffer: 66 | numel = t.numel() 67 | t.view(-1).copy_(buffer_t[offset:offset+numel]) 68 | offset += numel 69 | 70 | filled = 0 71 | for t in tensors: 72 | sz = t.numel() * t.element_size() 73 | if sz > buffer_size: 74 | # tensor is bigger than buffer, all-reduce and rescale directly 75 | torch.distributed.all_reduce(t) 76 | t.div_(rescale_denom) 77 | elif filled + sz > buffer_size: 78 | # buffer is full, all-reduce and replace buffer with grad 79 | all_reduce_buffer() 80 | buffer = [t] 81 | filled = sz 82 | else: 83 | # add tensor to buffer 84 | buffer.append(t) 85 | filled += sz 86 | 87 | if len(buffer) > 0: 88 | all_reduce_buffer() 89 | 90 | 91 | def all_gather_list(data, max_size=4096): 92 | """Gathers arbitrary data from all nodes into a list.""" 93 | world_size = torch.distributed.get_world_size() 94 | if not hasattr(all_gather_list, '_in_buffer') or \ 95 | max_size != all_gather_list._in_buffer.size(): 96 | all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size) 97 | all_gather_list._out_buffers = [ 98 | torch.cuda.ByteTensor(max_size) 99 | for i in range(world_size) 100 | ] 101 | in_buffer = all_gather_list._in_buffer 102 | out_buffers = all_gather_list._out_buffers 103 | 104 | enc = pickle.dumps(data) 105 | enc_size = len(enc) 106 | if enc_size + 2 > max_size: 107 | raise ValueError( 108 | 'encoded data exceeds max_size: {}'.format(enc_size + 2)) 109 | assert max_size < 255*256 110 | in_buffer[0] = enc_size // 255 # this encoding works for max_size < 65k 111 | in_buffer[1] = enc_size % 255 112 | in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc)) 113 | 114 | torch.distributed.all_gather(out_buffers, in_buffer.cuda()) 115 | 116 | results = [] 117 | for i in range(world_size): 118 | out_buffer = out_buffers[i] 119 | size = (255 * out_buffer[0].item()) + out_buffer[1].item() 120 | 121 | bytes_list = bytes(out_buffer[2:size+2].tolist()) 122 | result = pickle.loads(bytes_list) 123 | results.append(result) 124 | return results 125 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/models/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/models/adam.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.optim.optimizer import Optimizer 4 | 5 | 6 | class Adam(Optimizer): 7 | r"""Implements Adam algorithm. 8 | 9 | It has been proposed in `Adam: A Method for Stochastic Optimization`_. 10 | 11 | Arguments: 12 | params (iterable): iterable of parameters to optimize or dicts defining 13 | parameter groups 14 | lr (float, optional): learning rate (default: 1e-3) 15 | betas (Tuple[float, float], optional): coefficients used for computing 16 | running averages of gradient and its square (default: (0.9, 0.999)) 17 | eps (float, optional): term added to the denominator to improve 18 | numerical stability (default: 1e-8) 19 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 20 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 21 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 22 | (default: False) 23 | 24 | .. _Adam\: A Method for Stochastic Optimization: 25 | https://arxiv.org/abs/1412.6980 26 | .. _On the Convergence of Adam and Beyond: 27 | https://openreview.net/forum?id=ryQu7f-RZ 28 | """ 29 | 30 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 31 | weight_decay=0, amsgrad=False): 32 | if not 0.0 <= lr: 33 | raise ValueError("Invalid learning rate: {}".format(lr)) 34 | if not 0.0 <= eps: 35 | raise ValueError("Invalid epsilon value: {}".format(eps)) 36 | if not 0.0 <= betas[0] < 1.0: 37 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 38 | if not 0.0 <= betas[1] < 1.0: 39 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 40 | defaults = dict(lr=lr, betas=betas, eps=eps, 41 | weight_decay=weight_decay, amsgrad=amsgrad) 42 | super(Adam, self).__init__(params, defaults) 43 | 44 | def __setstate__(self, state): 45 | super(Adam, self).__setstate__(state) 46 | for group in self.param_groups: 47 | group.setdefault('amsgrad', False) 48 | 49 | def step(self, closure=None): 50 | """Performs a single optimization step. 51 | Arguments: 52 | closure (callable, optional): A closure that reevaluates the model 53 | and returns the loss. 54 | """ 55 | loss = None 56 | if closure is not None: 57 | loss = closure() 58 | 59 | 60 | for group in self.param_groups: 61 | for p in group['params']: 62 | if p.grad is None: 63 | continue 64 | grad = p.grad.data 65 | if grad.is_sparse: 66 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 67 | 68 | state = self.state[p] 69 | 70 | # State initialization 71 | if len(state) == 0: 72 | state['step'] = 0 73 | # Exponential moving average of gradient values 74 | state['next_m'] = torch.zeros_like(p.data) 75 | # Exponential moving average of squared gradient values 76 | state['next_v'] = torch.zeros_like(p.data) 77 | 78 | next_m, next_v = state['next_m'], state['next_v'] 79 | beta1, beta2 = group['betas'] 80 | 81 | # Decay the first and second moment running average coefficient 82 | # In-place operations to update the averages at the same time 83 | next_m.mul_(beta1).add_(1 - beta1, grad) 84 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 85 | update = next_m / (next_v.sqrt() + group['eps']) 86 | 87 | # Just adding the square of the weights to the loss function is *not* 88 | # the correct way of using L2 regularization/weight decay with Adam, 89 | # since that will interact with the m and v parameters in strange ways. 
90 | # 91 | # Instead we want to decay the weights in a manner that doesn't interact 92 | # with the m/v parameters. This is equivalent to adding the square 93 | # of the weights to the loss with plain (non-momentum) SGD. 94 | if group['weight_decay'] > 0.0: 95 | update += group['weight_decay'] * p.data 96 | 97 | lr_scheduled = group['lr'] 98 | 99 | update_with_lr = lr_scheduled * update 100 | p.data.add_(-update_with_lr) 101 | 102 | state['step'] += 1 103 | 104 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 105 | # No bias correction 106 | # bias_correction1 = 1 - beta1 ** state['step'] 107 | # bias_correction2 = 1 - beta2 ** state['step'] 108 | 109 | return loss -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/models/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from models.neural import MultiHeadedAttention, PositionwiseFeedForward 7 | 8 | 9 | class Classifier(nn.Module): 10 | def __init__(self, hidden_size): 11 | super(Classifier, self).__init__() 12 | self.linear1 = nn.Linear(hidden_size, 1) 13 | self.sigmoid = nn.Sigmoid() 14 | 15 | def forward(self, x, mask_cls): 16 | h = self.linear1(x).squeeze(-1) 17 | sent_scores = self.sigmoid(h) * mask_cls.float() 18 | return sent_scores 19 | 20 | 21 | class PositionalEncoding(nn.Module): 22 | 23 | def __init__(self, dropout, dim, max_len=5000): 24 | pe = torch.zeros(max_len, dim) 25 | position = torch.arange(0, max_len).unsqueeze(1) 26 | div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * 27 | -(math.log(10000.0) / dim))) 28 | pe[:, 0::2] = torch.sin(position.float() * div_term) 29 | pe[:, 1::2] = torch.cos(position.float() * div_term) 30 | pe = pe.unsqueeze(0) 31 | super(PositionalEncoding, self).__init__() 32 | self.register_buffer('pe', pe) 33 | self.dropout = nn.Dropout(p=dropout) 34 | self.dim = dim 35 | 36 | def forward(self, emb, step=None): 37 | emb = emb * math.sqrt(self.dim) 38 | if (step): 39 | emb = emb + self.pe[:, step][:, None, :] 40 | 41 | else: 42 | emb = emb + self.pe[:, :emb.size(1)] 43 | emb = self.dropout(emb) 44 | return emb 45 | 46 | def get_emb(self, emb): 47 | return self.pe[:, :emb.size(1)] 48 | 49 | 50 | class TransformerEncoderLayer(nn.Module): 51 | def __init__(self, d_model, heads, d_ff, dropout): 52 | super(TransformerEncoderLayer, self).__init__() 53 | 54 | self.self_attn = MultiHeadedAttention( 55 | heads, d_model, dropout=dropout) 56 | self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) 57 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 58 | self.dropout = nn.Dropout(dropout) 59 | 60 | def forward(self, iter, query, inputs, mask): 61 | if (iter != 0): 62 | input_norm = self.layer_norm(inputs) 63 | else: 64 | input_norm = inputs 65 | 66 | mask = mask.unsqueeze(1) 67 | context = self.self_attn(input_norm, input_norm, input_norm, 68 | mask=mask) 69 | out = self.dropout(context) + inputs 70 | return self.feed_forward(out) 71 | 72 | 73 | class ExtTransformerEncoder(nn.Module): 74 | def __init__(self, d_model, d_ff, heads, dropout, num_inter_layers=0): 75 | super(ExtTransformerEncoder, self).__init__() 76 | self.d_model = d_model 77 | self.num_inter_layers = num_inter_layers 78 | self.pos_emb = PositionalEncoding(dropout, d_model) 79 | self.transformer_inter = nn.ModuleList( 80 | [TransformerEncoderLayer(d_model, heads, d_ff, dropout) 81 | for _ in 
range(num_inter_layers)]) 82 | self.dropout = nn.Dropout(dropout) 83 | self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 84 | self.wo = nn.Linear(d_model, 1, bias=True) 85 | self.sigmoid = nn.Sigmoid() 86 | 87 | def forward(self, top_vecs, mask): 88 | """ See :obj:`EncoderBase.forward()`""" 89 | 90 | batch_size, n_sents = top_vecs.size(0), top_vecs.size(1) 91 | pos_emb = self.pos_emb.pe[:, :n_sents] 92 | x = top_vecs * mask[:, :, None].float() 93 | x = x + pos_emb 94 | 95 | for i in range(self.num_inter_layers): 96 | x = self.transformer_inter[i](i, x, x, 1 - mask) # all_sents * max_tokens * dim 97 | 98 | x = self.layer_norm(x) 99 | sent_scores = self.sigmoid(self.wo(x)) 100 | sent_scores = sent_scores.squeeze(-1) * mask.float() 101 | 102 | return sent_scores 103 | 104 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/others/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/others/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/others/logging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | 6 | logger = logging.getLogger() 7 | 8 | 9 | def init_logger(log_file=None, log_file_level=logging.NOTSET): 10 | log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s") 11 | logger = logging.getLogger() 12 | logger.setLevel(logging.INFO) 13 | 14 | console_handler = logging.StreamHandler() 15 | console_handler.setFormatter(log_format) 16 | logger.handlers = [console_handler] 17 | 18 | if log_file and log_file != '': 19 | file_handler = logging.FileHandler(log_file) 20 | file_handler.setLevel(log_file_level) 21 | file_handler.setFormatter(log_format) 22 | logger.addHandler(file_handler) 23 | 24 | return logger 25 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/others/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import time 5 | 6 | from others import pyrouge 7 | 8 | REMAP = {"-lrb-": "(", "-rrb-": ")", "-lcb-": "{", "-rcb-": "}", 9 | "-lsb-": "[", "-rsb-": "]", "``": '"', "''": '"'} 10 | 11 | 12 | def clean(x): 13 | return re.sub( 14 | r"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''", 15 | lambda m: REMAP.get(m.group()), x) 16 | 17 | 18 | def process(params): 19 | temp_dir, data = params 20 | candidates, references, pool_id = data 21 | cnt = len(candidates) 22 | current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) 23 | tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}-{}".format(current_time, pool_id)) 24 | if not os.path.isdir(tmp_dir): 25 | os.mkdir(tmp_dir) 26 | os.mkdir(tmp_dir + "/candidate") 27 | os.mkdir(tmp_dir + "/reference") 28 | try: 29 | 30 | for i in range(cnt): 31 | if len(references[i]) < 1: 32 | continue 33 | with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w", 34 | encoding="utf-8") as f: 35 | f.write(candidates[i]) 36 | with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w", 37 | encoding="utf-8") as f: 38 | f.write(references[i]) 39 | r = pyrouge.Rouge155(temp_dir=temp_dir) 40 | r.model_dir = tmp_dir + "/reference/" 41 | 
r.system_dir = tmp_dir + "/candidate/" 42 | r.model_filename_pattern = 'ref.#ID#.txt' 43 | r.system_filename_pattern = r'cand.(\d+).txt' 44 | rouge_results = r.convert_and_evaluate() 45 | print(rouge_results) 46 | results_dict = r.output_to_dict(rouge_results) 47 | finally: 48 | pass 49 | if os.path.isdir(tmp_dir): 50 | shutil.rmtree(tmp_dir) 51 | return results_dict 52 | 53 | 54 | def test_rouge(temp_dir, cand, ref): 55 | candidates = [line.strip() for line in open(cand, encoding='utf-8')] 56 | references = [line.strip() for line in open(ref, encoding='utf-8')] 57 | print(len(candidates)) 58 | print(len(references)) 59 | assert len(candidates) == len(references) 60 | 61 | cnt = len(candidates) 62 | current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) 63 | tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time)) 64 | if not os.path.isdir(tmp_dir): 65 | os.mkdir(tmp_dir) 66 | os.mkdir(tmp_dir + "/candidate") 67 | os.mkdir(tmp_dir + "/reference") 68 | try: 69 | 70 | for i in range(cnt): 71 | if len(references[i]) < 1: 72 | continue 73 | with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w", 74 | encoding="utf-8") as f: 75 | f.write(candidates[i]) 76 | with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w", 77 | encoding="utf-8") as f: 78 | f.write(references[i]) 79 | r = pyrouge.Rouge155(temp_dir=temp_dir) 80 | r.model_dir = tmp_dir + "/reference/" 81 | r.system_dir = tmp_dir + "/candidate/" 82 | r.model_filename_pattern = 'ref.#ID#.txt' 83 | r.system_filename_pattern = r'cand.(\d+).txt' 84 | rouge_results = r.convert_and_evaluate() 85 | print(rouge_results) 86 | results_dict = r.output_to_dict(rouge_results) 87 | finally: 88 | pass 89 | if os.path.isdir(tmp_dir): 90 | shutil.rmtree(tmp_dir) 91 | return results_dict 92 | 93 | 94 | def tile(x, count, dim=0): 95 | """ 96 | Tiles x on dimension dim count times. 
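For example, tile(x, 3, dim=0) maps rows [a, b] to [a, a, a, b, b, b]; this is typically used to expand encoder states to beam_size copies during beam search.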
97 | """ 98 | perm = list(range(len(x.size()))) 99 | if dim != 0: 100 | perm[0], perm[dim] = perm[dim], perm[0] 101 | x = x.permute(perm).contiguous() 102 | out_size = list(x.size()) 103 | out_size[0] *= count 104 | batch = x.size(0) 105 | x = x.view(batch, -1) \ 106 | .transpose(0, 1) \ 107 | .repeat(count, 1) \ 108 | .transpose(0, 1) \ 109 | .contiguous() \ 110 | .view(*out_size) 111 | if dim != 0: 112 | x = x.permute(perm).contiguous() 113 | return x 114 | 115 | def rouge_results_to_str(results_dict): 116 | return ">> ROUGE-F(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\nROUGE-R(1/2/3/l): {:.2f}/{:.2f}/{:.2f}\n".format( 117 | results_dict["rouge_1_f_score"] * 100, 118 | results_dict["rouge_2_f_score"] * 100, 119 | # results_dict["rouge_3_f_score"] * 100, 120 | results_dict["rouge_l_f_score"] * 100, 121 | results_dict["rouge_1_recall"] * 100, 122 | results_dict["rouge_2_recall"] * 100, 123 | # results_dict["rouge_3_f_score"] * 100, 124 | results_dict["rouge_l_recall"] * 100 125 | 126 | # ,results_dict["rouge_su*_f_score"] * 100 127 | ) 128 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/post_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from os import path 3 | from functools import reduce 4 | import re 5 | 6 | def str2bool(v): 7 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 8 | return True 9 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 10 | return False 11 | else: 12 | raise argparse.ArgumentTypeError('Boolean value expected.') 13 | 14 | 15 | 16 | def n_grams(tokens, n): 17 | l = len(tokens) 18 | return [tuple(tokens[i:i + n]) for i in range(l) if i + n < l] 19 | 20 | def has_repeat(elements): 21 | d = set(elements) 22 | return len(d) < len(elements) 23 | 24 | def cal_self_repeat(summary): 25 | ngram_repeats = {2: 0, 4: 0, 8: 0} 26 | sents = summary.split('') 27 | for n in ngram_repeats.keys(): 28 | # Respect sentence boundary 29 | grams = reduce(lambda x, y: x + y, [n_grams(sent.split(), n) for sent in sents], []) 30 | ngram_repeats[n] += has_repeat(grams) 31 | return ngram_repeats 32 | 33 | def cal_novel(summary, gold, source, summary_ngram_novel, gold_ngram_novel): 34 | summary = summary.replace('',' ') 35 | summary = re.sub(r' +', ' ', summary).strip() 36 | gold = gold.replace('',' ') 37 | gold = re.sub(r' +', ' ', gold).strip() 38 | source = source.replace(' ##','') 39 | source = source.replace('[CLS]',' ').replace('[SEP]',' ').replace('[PAD]',' ') 40 | source = re.sub(r' +', ' ', source).strip() 41 | 42 | 43 | for n in summary_ngram_novel.keys(): 44 | summary_grams = set(n_grams(summary.split(), n)) 45 | gold_grams = set(n_grams(gold.split(), n)) 46 | source_grams = set(n_grams(source.split(), n)) 47 | joint = summary_grams.intersection(source_grams) 48 | novel = summary_grams - joint 49 | summary_ngram_novel[n][0] += 1.0*len(novel) 50 | summary_ngram_novel[n][1] += len(summary_grams) 51 | summary_ngram_novel[n][2] += 1.0 * len(novel) / (len(summary.split()) + 1e-6) 52 | joint = gold_grams.intersection(source_grams) 53 | novel = gold_grams - joint 54 | gold_ngram_novel[n][0] += 1.0*len(novel) 55 | gold_ngram_novel[n][1] += len(gold_grams) 56 | gold_ngram_novel[n][2] += 1.0 * len(novel) / (len(gold.split()) + 1e-6) 57 | 58 | 59 | def cal_repeat(args): 60 | candidate_lines = open(args.result_path+'.candidate').read().strip().split('\n') 61 | gold_lines = open(args.result_path+'.gold').read().strip().split('\n') 62 | src_lines = 
open(args.result_path+'.raw_src').read().strip().split('\n') 63 | lines = zip(candidate_lines,gold_lines,src_lines) 64 | 65 | summary_ngram_novel = {1: [0, 0, 0], 2: [0, 0, 0], 4: [0, 0, 0]} 66 | gold_ngram_novel = {1: [0, 0, 0], 2: [0, 0, 0], 4: [0, 0, 0]} 67 | 68 | for c,g,s in lines: 69 | # self_repeats = cal_self_repeat(c) 70 | cal_novel(c, g, s,summary_ngram_novel, gold_ngram_novel) 71 | print(summary_ngram_novel, gold_ngram_novel) 72 | 73 | for n in summary_ngram_novel.keys(): 74 | # summary_ngram_novel[n] = summary_ngram_novel[n][2]/len(src_lines) 75 | # gold_ngram_novel[n] = gold_ngram_novel[n][2]/len(src_lines) 76 | summary_ngram_novel[n] = summary_ngram_novel[n][0]/summary_ngram_novel[n][1] 77 | gold_ngram_novel[n] = gold_ngram_novel[n][0]/gold_ngram_novel[n][1] 78 | print(summary_ngram_novel, gold_ngram_novel) 79 | 80 | 81 | if __name__ == '__main__': 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("-mode", default='', type=str) 84 | parser.add_argument("-result_path", default='../../results/cnndm.0') 85 | 86 | 87 | args = parser.parse_args() 88 | eval(args.mode + '(args)') 89 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/prepro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/prepro/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/prepro/utils.py: -------------------------------------------------------------------------------- 1 | # stopwords = pkgutil.get_data(__package__, 'smart_common_words.txt') 2 | # stopwords = stopwords.decode('ascii').split('\n') 3 | # stopwords = {key.strip(): 1 for key in stopwords} 4 | 5 | 6 | def _get_ngrams(n, text): 7 | """Calcualtes n-grams. 8 | 9 | Args: 10 | n: which n-grams to calculate 11 | text: An array of tokens 12 | 13 | Returns: 14 | A set of n-grams 15 | """ 16 | ngram_set = set() 17 | text_length = len(text) 18 | max_index_ngram_start = text_length - n 19 | for i in range(max_index_ngram_start + 1): 20 | ngram_set.add(tuple(text[i:i + n])) 21 | return ngram_set 22 | 23 | 24 | def _get_word_ngrams(n, sentences): 25 | """Calculates word n-grams for multiple sentences. 
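Note: sentences is a list of token lists; the tokens of all sentences are concatenated before the n-grams are extracted.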
26 | """ 27 | assert len(sentences) > 0 28 | assert n > 0 29 | 30 | # words = _split_into_words(sentences) 31 | 32 | words = sum(sentences, []) 33 | # words = [w for w in words if w not in stopwords] 34 | return _get_ngrams(n, words) 35 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/preprocess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | 4 | import argparse 5 | import time 6 | 7 | from others.logging import init_logger 8 | from prepro import data_builder 9 | 10 | 11 | def do_format_to_lines(args): 12 | print(time.clock()) 13 | data_builder.format_to_lines(args) 14 | print(time.clock()) 15 | 16 | def do_format_to_bert(args): 17 | print(time.clock()) 18 | data_builder.format_to_bert(args) 19 | print(time.clock()) 20 | 21 | 22 | 23 | def do_format_xsum_to_lines(args): 24 | print(time.clock()) 25 | data_builder.format_xsum_to_lines(args) 26 | print(time.clock()) 27 | 28 | def do_tokenize(args): 29 | print(time.clock()) 30 | data_builder.tokenize(args) 31 | print(time.clock()) 32 | 33 | 34 | def str2bool(v): 35 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 36 | return True 37 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 38 | return False 39 | else: 40 | raise argparse.ArgumentTypeError('Boolean value expected.') 41 | 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument("-pretrained_model", default='bert', type=str) 46 | 47 | parser.add_argument("-mode", default='', type=str) 48 | parser.add_argument("-select_mode", default='greedy', type=str) 49 | parser.add_argument("-map_path", default='../../data/') 50 | parser.add_argument("-raw_path", default='../../line_data') 51 | parser.add_argument("-save_path", default='../../data/') 52 | 53 | parser.add_argument("-shard_size", default=2000, type=int) 54 | parser.add_argument('-min_src_nsents', default=3, type=int) 55 | parser.add_argument('-max_src_nsents', default=100, type=int) 56 | parser.add_argument('-min_src_ntokens_per_sent', default=5, type=int) 57 | parser.add_argument('-max_src_ntokens_per_sent', default=200, type=int) 58 | parser.add_argument('-min_tgt_ntokens', default=5, type=int) 59 | parser.add_argument('-max_tgt_ntokens', default=500, type=int) 60 | 61 | parser.add_argument("-lower", type=str2bool, nargs='?',const=True,default=True) 62 | parser.add_argument("-use_bert_basic_tokenizer", type=str2bool, nargs='?',const=True,default=False) 63 | 64 | parser.add_argument('-log_file', default='../../logs/cnndm.log') 65 | 66 | parser.add_argument('-dataset', default='') 67 | 68 | parser.add_argument('-n_cpus', default=2, type=int) 69 | 70 | 71 | args = parser.parse_args() 72 | init_logger(args.log_file) 73 | eval('data_builder.'+args.mode + '(args)') 74 | -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/translate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/abstractive/BertSum-Abs/PreSumm/src/translate/__init__.py -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/PreSumm/src/translate/penalties.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class PenaltyBuilder(object): 6 | """ 7 
| Returns the Length and Coverage Penalty function for Beam Search. 8 | 9 | Args: 10 | length_pen (str): option name of length pen 11 | cov_pen (str): option name of cov pen 12 | """ 13 | 14 | def __init__(self, length_pen): 15 | self.length_pen = length_pen 16 | 17 | def length_penalty(self): 18 | if self.length_pen == "wu": 19 | return self.length_wu 20 | elif self.length_pen == "avg": 21 | return self.length_average 22 | else: 23 | return self.length_none 24 | 25 | """ 26 | Below are all the different penalty terms implemented so far 27 | """ 28 | 29 | 30 | def length_wu(self, beam, logprobs, alpha=0.): 31 | """ 32 | NMT length re-ranking score from 33 | "Google's Neural Machine Translation System" :cite:`wu2016google`. 34 | """ 35 | 36 | modifier = (((5 + len(beam.next_ys)) ** alpha) / 37 | ((5 + 1) ** alpha)) 38 | return (logprobs / modifier) 39 | 40 | def length_average(self, beam, logprobs, alpha=0.): 41 | """ 42 | Returns the average probability of tokens in a sequence. 43 | """ 44 | return logprobs / len(beam.next_ys) 45 | 46 | def length_none(self, beam, logprobs, alpha=0., beta=0.): 47 | """ 48 | Returns unmodified scores. 49 | """ 50 | return logprobs -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | PreSumm - Contains the dev branch of https://github.com/nlpyang/PreSumm 4 | 5 | Chunker.ipynb - Script to form chunks from text files 6 | 7 | # Requirements 8 | 9 | multiprocess 0.70.9 10 | numpy 1.17.2 11 | pyrouge 0.1.3 12 | pytorch-transformers 1.2.0 13 | tensorboardX 1.9 14 | torch 1.1.0 15 | 16 | # Usage 17 | 18 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 19 | 2. Use Chunker to chunk the input documents into chunk files 20 | 3. Use PreSumm to generate summaries for the chunk files -------------------------------------------------------------------------------- /abstractive/BertSum-Abs/chunker.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"111743d0","metadata":{"id":"111743d0"},"source":["### Script to chunk and save every file. Every line in the output file is a chunk. 
"]},{"cell_type":"code","execution_count":null,"id":"6f05a1f8","metadata":{"id":"6f05a1f8"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"82ea4352","metadata":{"id":"82ea4352"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import os\n","import nltk\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")"]},{"cell_type":"code","execution_count":null,"id":"74b25143","metadata":{"id":"74b25143"},"outputs":[],"source":["import os\n","if not os.path.exists(output_path + './'+ dataset +'_chunked/'):\n"," os.makedirs(output_path + './'+ dataset +'_chunked/')"]},{"cell_type":"code","execution_count":null,"id":"0238b231","metadata":{"id":"0238b231"},"outputs":[],"source":["for i in range(len(data_source)):\n"," doc = data_source[i].replace(\"\\n\", \" \")\n"," chunks = nest_sentences(doc, 512)\n"," save = \"\\n\".join(chunks)\n"," path = output_path + \"./\"+ dataset +\"_chunked/\" + names[i]\n"," file = open(path,'w')\n"," file.write(save)\n"," file.close()\n","# break"]}],"metadata":{"colab":{"name":"chunker.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BigBird/Pretrained_Bigbird.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"0e32b71c","metadata":{"id":"0e32b71c"},"source":["### Script to generate summaries using chunking based pre-trained BigBird model\n","\n","Assign the dataset and output_path variable according to requirements. 
\n"]},{"cell_type":"code","execution_count":null,"id":"177e102c","metadata":{"id":"177e102c"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext \n","output_path = \"./IN_BigBird/\""]},{"cell_type":"code","execution_count":null,"id":"80b38f7d","metadata":{"id":"80b38f7d"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","import os\n","import nltk\n","import torch\n","from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer"]},{"cell_type":"code","execution_count":null,"id":"2ed47d74","metadata":{"id":"2ed47d74"},"outputs":[],"source":["if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"b6fd18d6","metadata":{"id":"b6fd18d6"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","len_dic = dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"2f7b0310","metadata":{"id":"2f7b0310"},"outputs":[],"source":["DATASET_NAME = \"pubmed\"\n","DEVICE = \"cuda:1\"\n","CACHE_DIR = DATASET_NAME\n","MODEL_ID = f\"google/bigbird-pegasus-large-{DATASET_NAME}\"\n","\n","tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n","model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)\n"]},{"cell_type":"code","execution_count":null,"id":"9e8f7c79","metadata":{"id":"9e8f7c79"},"outputs":[],"source":["def get_summ(input, max_l, min_l):\n"," '''\n"," Function to generate summaries from the document. This function uses a chunking-based approach.\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," nested = nest_sentences(input, 4096)\n"," summs = []\n"," for chunk in nested:\n"," inputs_dict = tokenizer(chunk, padding=\"max_length\", max_length=4096, return_tensors=\"pt\", truncation=True)\n"," inputs_dict = {k: inputs_dict[k].to(DEVICE) for k in inputs_dict}\n"," predicted_abstract_ids = model.generate(**inputs_dict, min_length=min_l, num_beams=5)\n"," result = tokenizer.decode(predicted_abstract_ids[0], skip_special_tokens=True)\n"," summs.append(result)\n","# print(result)\n"," summ = '. 
'.join(summs)\n"," return summ\n"," "]},{"cell_type":"code","execution_count":null,"id":"36780f7e","metadata":{"id":"36780f7e"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","result = []\n","for i in range(len(data_source)):\n"," print(str(i) + \" : \" + names[i])\n"," summ = get_summ(data_source[i], len_dic[names[i]], len_dic[names[i]]-100)\n"," result.append(summ)\n"," path = output_path + names[i]\n"," file = open(path,'w')\n"," file.write(summ)\n"," file.close()\n","# break\n","print(result)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"Pretrained_Bigbird.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/BigBird/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | Pretrained_Bigbird.ipynb - Script to generate summaries using chunking based pre-trained BigBird Model 4 | 5 | # Requirements 6 | 7 | transformers 4.12.3 8 | 9 | # Usage 10 | 11 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 12 | 2. Follow the instructions given in the notebooks -------------------------------------------------------------------------------- /abstractive/Generate_fine_tuning_data/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | gen_fine_tuning_data_CLS.ipynb - Script to generate fine tuning data using CLS similarity method 4 | 5 | gen_fine_tuning_data_MCS.ipynb - Script to generate fine tuning data using MCS similarity method 6 | 7 | gen_fine_tuning_data_SIF.ipynb - Script to generate fine-tuning data using SIF similarity method 8 | 9 | gen_fine_tuning_data_MCS_RR.ipynb - Script to generate fine-tuning data using MCS similarity method and rhetorical role-based segmenting 10 | 11 | 12 | # Requirements 13 | 14 | transformers 4.12.3 15 | 16 | pytorch 1.10 17 | 18 | pytorch lightning 1.5.1 19 | 20 | bert-extractive-summarizer 0.9.0 21 | 22 | # Usage 23 | 24 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 25 | 2. Add the UK_freq.json or IN_freq.json to the folder 26 | 3. Follow the instructions given in the notebooks 27 | 28 | Note that rhetorical role-based segmenting requires rhetorical role labeled documents. Each line in the document would contain a sentence, \t, and the label (sentence_1\tlabel). 
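For reference, here is a minimal sketch of how such a rhetorical-role labelled file could be read; the helper name and the label value shown are illustrative only, and the actual label set depends on the rhetorical role labeller used:

```python
# Minimal sketch (assumed layout): every line of a labelled document is
# "<sentence>\t<label>"; the label names are only placeholders.
def read_rr_labelled_doc(path):
    sentences, labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            sentence, _, label = line.rpartition("\t")
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

# e.g. read_rr_labelled_doc("doc_1.txt") might return
# (["The appeal is dismissed with costs."], ["Ruling"])
```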
29 | 30 | Rhetorical role labeling - https://link.springer.com/article/10.1007/s10506-021-09304-5 -------------------------------------------------------------------------------- /abstractive/Generate_fine_tuning_data/gen_fine_tuning_data_CLS.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"06bdfbbf","metadata":{"id":"06bdfbbf"},"source":["### Script to generate fine tuning data using CLS similarity method\n","\n","Change the datasets variable according to the requirements"]},{"cell_type":"code","execution_count":null,"id":"6a8faa6c","metadata":{"id":"6a8faa6c"},"outputs":[],"source":["dataset = \"IN\" # Options: IN, UK "]},{"cell_type":"code","execution_count":null,"id":"605bcef6","metadata":{"id":"605bcef6"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import json\n","import os\n","import sys\n","from tqdm import tqdm\n","sys.path.insert(0, '../')\n","from utilities import *\n","from sklearn.metrics.pairwise import cosine_similarity\n","import torch"]},{"cell_type":"code","execution_count":null,"id":"802a0612","metadata":{"id":"802a0612"},"outputs":[],"source":["#Reading the documents and summaries \n","names, data_source, data_summary = get_summary_data(dataset, \"train\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))"]},{"cell_type":"code","execution_count":null,"id":"9b0c3bff","metadata":{"id":"9b0c3bff"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import AutoTokenizer\n","from transformers import BertModel\n","tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n","bert_model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = False).to(\"cuda\")\n","bert_model.eval()"]},{"cell_type":"code","execution_count":null,"id":"c14ab500","metadata":{"id":"c14ab500"},"outputs":[],"source":["def get_sen_encoding(sents):\n"," '''\n"," Function to generate encoding for each word in the input list \n"," input: sents - List of sentences\n"," returns the list of the sentence encoding \n"," '''\n"," a = 0.001\n"," answer = None\n"," for sent in sents:\n"," ip =tokenizer(sent, return_tensors='pt', max_length=150, truncation=True, padding='max_length')\n"," tokens = tokenizer.convert_ids_to_tokens(ip['input_ids'][0])\n"," ip = ip.to(\"cuda\")\n"," bert_output = bert_model(**ip)\n"," embedding = bert_output['pooler_output'].clone().detach()\n"," embedding = embedding.to(\"cpu\")\n"," if answer == None:\n"," answer = embedding\n"," answer.resize_(1, 768)\n"," else:\n"," embedding.resize_(1, 768)\n"," answer = torch.cat((answer, embedding),0)\n"," return answer"]},{"cell_type":"code","execution_count":null,"id":"a6665614","metadata":{"id":"a6665614"},"outputs":[],"source":["def similarity_l_l(l1, l2):\n"," '''\n"," Function to find the most similar sentence in the document for each sentence in the summary \n"," input: l1 - Summary sentences\n"," l2 - Document sentences\n"," returns a list of document sentence indexes for each sentence in the summary \n"," '''\n"," l = l1+l2\n"," sents_encodings = get_sen_encoding(l)\n"," similarities=cosine_similarity(sents_encodings)\n"," \n"," result = []\n"," for i in range(len(l1)):\n"," vals = similarities[i]\n"," vals = vals[len(l1):]\n"," idx = np.argmax(vals)\n"," result.append(idx)\n"," return result"]},{"cell_type":"code","execution_count":null,"id":"f574fa42","metadata":{"id":"f574fa42"},"outputs":[],"source":["def 
get_chunks_data_from_docV2(doc, summ):\n"," '''\n"," Function to generate chunks along with their summaries \n"," input: doc - legal Document\n"," summ - Gold standard summary\n"," returns a list of chunks and their summaries \n"," '''\n"," chunk_summ_word_threshold = 150\n"," sentence_mapping = {}\n"," doc_sents = split_to_sentences(doc)\n"," summ_sents = split_to_sentences(summ)\n"," \n"," result = (similarity_l_l(summ_sents,doc_sents))\n"," \n"," for i in range(len(summ_sents)):\n"," sentence_mapping[doc_sents[result[i]]] = summ_sents[i]\n"," \n"," final_chunks = []\n"," final_summ = []\n"," for chunk in nest_sentencesV2(doc, 1024):\n"," summ = \"\"\n"," for chunk_sent in chunk:\n"," if chunk_sent in sentence_mapping:\n"," summ = summ + sentence_mapping[chunk_sent]\n"," if len(tokenizer.tokenize(summ)) >= chunk_summ_word_threshold:\n"," final_chunks.append(\" \".join(chunk))\n"," final_summ.append(summ)\n"," return final_chunks, final_summ\n"]},{"cell_type":"code","execution_count":null,"id":"680336f0","metadata":{"id":"680336f0"},"outputs":[],"source":["#loop to pass every document, generate the fine tuning data and saving in a excel file \n","import pandas as pd\n","training_chunks = []\n","training_summs = []\n","for i in tqdm(range(len(data_source))):\n"," cks, summs = get_chunks_data_from_docV2(data_source[i],data_summary[i])\n"," training_chunks = training_chunks + cks\n"," training_summs = training_summs + summs\n","# print(i, len(training_summs), end = \", \", sep = \" : \")\n"," if i%100 == 0: \n"," full = list(zip(training_chunks,training_summs))\n"," df = pd.DataFrame(full,columns=['data', 'summary'])\n"," df.to_excel(\"FD_\"+dataset+\"_CLS_BK.xlsx\")\n","# break\n","full = list(zip(training_chunks,training_summs))\n","df = pd.DataFrame(full,columns=['data', 'summary'])\n","df.to_excel(\"FD_\"+dataset+\"_CLS.xlsx\")"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"gen_fine_tuning_data_CLS.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/Generate_fine_tuning_data/gen_fine_tuning_data_MCS.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"f1087f67","metadata":{"id":"f1087f67"},"source":["### Script to generate fine tuning data using MCS similarity method\n","\n","Change the datasets variable according to the requirements"]},{"cell_type":"code","execution_count":null,"id":"a4e43880","metadata":{"id":"a4e43880"},"outputs":[],"source":["dataset = \"IN\" # Options: IN, UK "]},{"cell_type":"code","execution_count":null,"id":"d5098bbc","metadata":{"id":"d5098bbc"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import json\n","from tqdm import tqdm\n","import os\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","# os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""]},{"cell_type":"code","execution_count":null,"id":"e1ea3042","metadata":{"id":"e1ea3042"},"outputs":[],"source":["#Reading the documents and summaries \n","names, 
data_source, data_summary = get_summary_data(dataset, \"train\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))"]},{"cell_type":"code","execution_count":null,"id":"43bdbed6","metadata":{"id":"43bdbed6"},"outputs":[],"source":["# Loading Model and tokenizer\n","from sentence_transformers import SentenceTransformer\n","from sklearn.metrics.pairwise import cosine_similarity\n","\n","sbert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens').to(\"cuda\")\n"]},{"cell_type":"code","execution_count":null,"id":"a6665614","metadata":{"id":"a6665614"},"outputs":[],"source":["def similarity_l_l(l1, l2):\n"," '''\n"," Function to find the most similar sentence in the document for each sentence in the summary \n"," input: l1 - Summary sentences\n"," l2 - Document sentences\n"," returns a list of document sentence indexes for each sentence in the summary \n"," '''\n"," document_embeddings = sbert_model.encode(l1+l2)\n"," similarities=cosine_similarity(document_embeddings)\n"," \n"," result = []\n"," for i in range(len(l1)):\n"," vals = similarities[i]\n"," vals = vals[len(l1):]\n"," idx = np.argmax(vals)\n"," result.append(idx)\n"," return result"]},{"cell_type":"code","execution_count":null,"id":"f574fa42","metadata":{"id":"f574fa42"},"outputs":[],"source":["def get_chunks_data_from_docV2(doc, summ):\n"," '''\n"," Function to generate chunks along with their summaries \n"," input: doc - legal Document\n"," summ - Gold standard summary\n"," returns a list of chunks and their summaries \n"," '''\n"," chunk_summ_word_threshold = 150\n"," sentence_mapping = {}\n"," doc_sents = split_to_sentences(doc)\n"," summ_sents = split_to_sentences(summ)\n"," \n"," result = (similarity_l_l(summ_sents,doc_sents))\n"," \n"," for i in range(len(summ_sents)):\n"," sentence_mapping[doc_sents[result[i]]] = summ_sents[i]\n"," \n"," final_chunks = []\n"," final_summ = []\n"," for chunk in nest_sentencesV2(doc, 1024):\n"," summ = \"\"\n"," for chunk_sent in chunk:\n"," if chunk_sent in sentence_mapping:\n"," summ = summ + sentence_mapping[chunk_sent]\n"," if len(summ.split(\" \")) >= chunk_summ_word_threshold:\n"," final_chunks.append(\" \".join(chunk))\n"," final_summ.append(summ)\n"," return final_chunks, final_summ\n","\n"]},{"cell_type":"code","execution_count":null,"id":"680336f0","metadata":{"id":"680336f0"},"outputs":[],"source":["#loop to pass every document, generate the fine tuning data and saving in a excel file \n","import pandas as pd\n","training_chunks = []\n","training_summs = []\n","for i in tqdm(range(len(data_source))):\n"," cks, summs = get_chunks_data_from_docV2(data_source[i],data_summary[i])\n"," training_chunks = training_chunks + cks\n"," training_summs = training_summs + summs\n","# print(i, len(training_summs), end = \", \", sep = \" : \")\n"," if i%100 == 0: \n"," full = list(zip(training_chunks,training_summs))\n"," df = pd.DataFrame(full,columns=['data', 'summary'])\n"," df.to_excel(\"FD_\" + dataset + \"_CSM_BK_512.xlsx\")\n","# break\n","full = list(zip(training_chunks,training_summs))\n","df = pd.DataFrame(full,columns=['data', 'summary'])\n","df.to_excel(\"FD_\" + dataset + \"_CSM_512.xlsx\")"]},{"cell_type":"code","execution_count":null,"id":"7c9b32fb","metadata":{"id":"7c9b32fb"},"outputs":[],"source":[""]}],"metadata":{"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.11"},"colab":{"name":"gen_fine_tuning_data_MCS.ipynb","provenance":[],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /abstractive/Legal-LED/Legal-LED_generate_summary.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"6cf3e374","metadata":{"id":"6cf3e374"},"source":["### Script to generate summaries using Legal-LED model\n","\n"]},{"cell_type":"code","execution_count":null,"id":"63ad1e2b","metadata":{},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext\n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"e2895caf","metadata":{},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","import os\n","import nltk"]},{"cell_type":"code","execution_count":null,"id":"7d7a6d73","metadata":{},"outputs":[],"source":["if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"af7086d5","metadata":{},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","len_dic = dict_names = get_req_len_dict(dataset, \"test\")"]},{"cell_type":"code","execution_count":null,"id":"07a2744e","metadata":{},"outputs":[],"source":["import os\n","import re\n","import numpy as np\n","import pandas as pd\n","import json\n","import random\n","import nltk\n","nltk.download('punkt')\n","\n","from IPython.display import display, HTML\n","import torch\n","import datasets\n","from datasets import load_dataset, load_metric, Dataset\n","from transformers import AutoTokenizer\n","from transformers import AutoModelForSeq2SeqLM\n","from transformers import LEDTokenizer, LEDForConditionalGeneration\n","from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments"]},{"cell_type":"code","execution_count":null,"id":"3f97b9a4","metadata":{},"outputs":[],"source":["model_name = \"nsi319/legal-led-base-16384\"\n","tokenizer = AutoTokenizer.from_pretrained(model_name)"]},{"cell_type":"code","execution_count":null,"id":"7721acec","metadata":{},"outputs":[],"source":["encoder_max_length = 1024*16\n","decoder_max_length = 1024"]},{"cell_type":"code","execution_count":null,"id":"e2667bb4","metadata":{},"outputs":[],"source":["led = AutoModelForSeq2SeqLM.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)\n","# set generate hyperparameters\n","led.config.num_beams = 2\n","led.config.max_length = decoder_max_length\n","led.config.min_length = 256\n","led.config.early_stopping = True\n","led.config.no_repeat_ngram_size = 4\n"]},{"cell_type":"code","execution_count":null,"id":"8ebe2023","metadata":{},"outputs":[],"source":["led = AutoModelForSeq2SeqLM.from_pretrained(\"./final_model/IN_model\", use_cache=False)\n","led = led.to(\"cuda\")"]},{"cell_type":"code","execution_count":null,"id":"cd35ada2","metadata":{},"outputs":[],"source":["import torch\n","led.eval()\n","\n","def generate_summary_gpu(input_text, l):\n","\tinputs = 
tokenizer(\n","\t\tinput_text,\n","\t\tpadding=\"max_length\",\n","\t\ttruncation=True,\n","\t\tmax_length=encoder_max_length,\n","\t\treturn_tensors=\"pt\",\n","\t)\n","\tinput_ids = inputs.input_ids.to(led.device)\n","\tattention_mask = inputs.attention_mask.to(led.device)\n","\tglobal_attention_mask = torch.zeros_like(attention_mask)\n","\t# put global attention on token\n","\tglobal_attention_mask[:, 0] = 1\n","\tl = max(l,256)\n","\tl = min(l,1024)\n","\toutputs = led.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask, max_length = l, num_beams = 2,repetition_penalty = 2.5,early_stopping = True)\t\n","\tpreds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_space=True) for gen_id in outputs]\n","\treturn \"\".join(preds)"]},{"cell_type":"code","execution_count":null,"id":"59e1530e","metadata":{},"outputs":[],"source":["for i in range(len(data_source)):\n"," name = names[i]\n"," doc = data_source[i]\n"," wc = doc.split(\" \")\n"," input_len = len(wc)\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len))\n"," \n"," abs_summ = generate_summary_gpu(doc,req_len)\n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n"," print(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()"]}],"metadata":{"accelerator":"GPU","colab":{"collapsed_sections":[],"machine_shape":"hm","name":"pegasus_finetune.ipynb","provenance":[]},"kernelspec":{"display_name":"Python 3.9.13 64-bit (microsoft store)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.13"},"vscode":{"interpreter":{"hash":"8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5"}}},"nbformat":4,"nbformat_minor":5} 2 | -------------------------------------------------------------------------------- /abstractive/Legal-LED/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | Legal-LED_generate_summary.ipynb - Script to generate summaries using Legal-LED Model 4 | 5 | Legal-LED_finetune.ipynb - Script to fine-tune Legal-LED Model 6 | 7 | # Requirements 8 | 9 | transformers 4.12.3 10 | 11 | pytorch 1.10 12 | 13 | # Usage 14 | 15 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 16 | 2. Follow the instructions given in the notebooks 17 | -------------------------------------------------------------------------------- /abstractive/Legal-Pegasus/Readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | pegasus-test.ipynb - Script to generate summaries using Pegasus Model 4 | 5 | pegasus-finetune.ipynb - Script to fine-tune Pegasus Model 6 | 7 | # Requirements 8 | 9 | transformers 4.12.3 10 | 11 | pytorch 1.10 12 | 13 | # Usage 14 | 15 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 16 | 2. 
Follow the instructions given in the notebooks 17 | -------------------------------------------------------------------------------- /abstractive/Legal-Pegasus/pegasus-test.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","id":"8a842a0a","metadata":{"id":"8a842a0a"},"source":["### Script to generate summaries using chunking based Pegasus approach"]},{"cell_type":"code","execution_count":null,"id":"cb972436","metadata":{"id":"cb972436"},"outputs":[],"source":["dataset = \"IN\" # Options: IN - IN-Abs, UK-UK-Abs, N2-IN-Ext\n","output_path = \"./output/\""]},{"cell_type":"code","execution_count":null,"id":"1a83915a","metadata":{"id":"1a83915a"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import glob\n","import sys\n","sys.path.insert(0, '../')\n","from utilities import *\n","import os\n","import nltk"]},{"cell_type":"code","execution_count":null,"id":"049e7a6f","metadata":{"id":"049e7a6f"},"outputs":[],"source":["if not os.path.exists(output_path):\n"," os.makedirs(output_path)"]},{"cell_type":"code","execution_count":null,"id":"0591966a","metadata":{"id":"0591966a"},"outputs":[],"source":["#Reading the test documents\n","names, data_source, data_summary = get_summary_data(dataset, \"test\")\n","print(len(names))\n","print(len(data_source))\n","print(len(data_summary))\n","len_dic = dict_names = get_req_len_dict(dataset, \"test\") "]},{"cell_type":"code","execution_count":null,"id":"14b68b70","metadata":{"id":"14b68b70"},"outputs":[],"source":["device = \"cuda:1\""]},{"cell_type":"code","execution_count":null,"id":"0596cefd","metadata":{"id":"0596cefd"},"outputs":[],"source":["# Loading Model and tokenizer\n","from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n","from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments\n","\n","tokenizer = AutoTokenizer.from_pretrained(\"nsi319/legal-pegasus\") \n","model = AutoModelForSeq2SeqLM.from_pretrained(\"nsi319/legal-pegasus\").to(device)\n"]},{"cell_type":"code","execution_count":null,"id":"49734483","metadata":{"id":"49734483"},"outputs":[],"source":["def summerize(text, max_len, min_len):\n"," '''\n"," Function to generate summary using Pegasus\n"," input: nested_sentences - chunks\n"," max_l - Maximum length\n"," min_l - Minimum length\n"," output: document summary\n"," '''\n"," try:\n"," input_tokenized = tokenizer.encode(text, return_tensors='pt',max_length=512,truncation=True).to(device)\n"," summary_ids = model.generate(input_tokenized,\n"," num_beams=9,\n"," length_penalty=0.1,\n"," min_length=min_len,\n"," max_length=max_len,\n"," )\n"," summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]\n"," return summary\n"," except:\n"," return \"\""]},{"cell_type":"code","execution_count":null,"id":"e7f05633","metadata":{"id":"e7f05633"},"outputs":[],"source":["def summerize_doc(nested_sentences, p):\n"," '''\n"," Function to generate summary using chunking based Pegasus\n"," input: nested_sentences - chunks\n"," p - Number of words in summaries per word in the document\n"," output: document summary\n"," '''\n"," device = 'cuda'\n"," result = []\n"," for nested in nested_sentences:\n"," l = int(p * len(nested.split(\" \")))\n"," max_len = l\n"," min_len = l-5\n"," result.append(summerize(nested, max_len, min_len))\n"," return 
result"]},{"cell_type":"code","execution_count":null,"id":"fd0385a1","metadata":{"id":"fd0385a1"},"outputs":[],"source":["done_files = glob.glob(output_path + \"*.txt\")\n","done_files = [i[i.rfind(\"/\")+1:] for i in done_files]"]},{"cell_type":"code","execution_count":null,"id":"0da2186e","metadata":{"id":"0da2186e"},"outputs":[],"source":["# main loop to generate and save summaries of each document in the test dataset\n","for i in range(len(data_source)):\n"," done_files = glob.glob(output_path + \"*.txt\")\n"," done_files = [i[i.rfind(\"/\")+1:] for i in done_files]\n"," name = names[i]\n"," if name in done_files:continue\n"," doc = data_source[i]\n"," input_len = len(doc.split(\" \"))\n"," req_len = dict_names[name]\n"," print(str(i) + \": \" + name + \" - \" + str(input_len) + \" : \" + str(req_len), end = \", \")\n"," \n"," nested = nest_sentences(doc,512)\n"," p = float(req_len/input_len)\n"," print(p)\n"," abs_summ = summerize_doc(nested,p)\n"," abs_summ = \" \".join(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," \n"," if len(abs_summ.split(\" \")) > req_len:\n"," abs_summ = abs_summ.split(\" \")\n"," abs_summ = abs_summ[:req_len]\n"," abs_summ = \" \".join(abs_summ)\n"," print(len((abs_summ.split(\" \"))))\n"," path = output_path + name\n"," file = open(path,'w')\n"," file.write(abs_summ)\n"," file.close()\n","# break"]},{"cell_type":"code","execution_count":null,"id":"33df9665","metadata":{"id":"33df9665"},"outputs":[],"source":[""]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.11"},"colab":{"name":"pegasus-test.ipynb","provenance":[],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /abstractive/Pointer Generator/chunker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c4f4f89a", 6 | "metadata": {}, 7 | "source": [ 8 | "### Script to chunk text files\n", 9 | "\n", 10 | "Each chunk would have its own text file" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "d855afea", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import glob\n", 23 | "import nltk\n", 24 | "import os\n", 25 | "import sys\n", 26 | "sys.path.insert(0, '../')\n", 27 | "from utilities import *" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "4bc23a54", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "output_path = \"./chunked/\"\n", 38 | "if not os.path.exists(output_path):\n", 39 | " os.makedirs(output_path)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "a446e900", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "names, data_source, data_summary = get_summary_data(\"IN\", \"test\")\n", 50 | "print(len(names))\n", 51 | "print(len(data_source))\n", 52 | "print(len(data_summary))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "366ba48f", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "output = []\n", 63 | "for i in range(len(data_source)):\n", 64 | " name = names[i]\n", 65 | " doc = 
data_source[i]\n", 66 | " wc = doc.split(\" \")\n", 67 | " print(str(i) + \": \" + name)\n", 68 | " nested = nest_sentences(doc,395)\n", 69 | " \n", 70 | " for i in range(len(nested)):\n", 71 | " path = output_path + name[:-4] + \"_\" + str(i) + \".txt\"\n", 72 | " print(path)\n", 73 | " file = open(path,'w')\n", 74 | " file.write(nested[i])\n", 75 | " file.close()\n", 76 | " \n", 77 | "print(output)" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.7.11" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 5 102 | } 103 | -------------------------------------------------------------------------------- /abstractive/Pointer Generator/combiner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ee01a304", 6 | "metadata": {}, 7 | "source": [ 8 | "### Script to combine summaries generated from each chunk for each file\n", 9 | "\n", 10 | "Change the input and output path according to requirements" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "02fdfddd", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "input_path = \"./chunked/\"\n", 22 | "output_path = \"./result/\"\n", 23 | "if not os.path.exists(output_path):\n", 24 | " os.makedirs(output_path)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "7bafb697", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import pandas as pd\n", 35 | "import numpy as np\n", 36 | "import glob\n", 37 | "import sys\n", 38 | "sys.path.insert(0, '../')\n", 39 | "from utilities import *" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "ed178de7", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Reading every chunk and combining summaries to get the final summaries\n", 50 | "from collections import defaultdict\n", 51 | "\n", 52 | "path = input_path\n", 53 | "all_files = glob.glob(path + \"/*.txt\")\n", 54 | "\n", 55 | "chunk_sum = defaultdict(list)\n", 56 | "\n", 57 | "for filename in all_files:\n", 58 | " with open(filename, 'r') as f: \n", 59 | " p = filename.rfind(\"/\")\n", 60 | " name = (filename[p+1:])\n", 61 | " chunk_number = int(name[name.rfind(\"_\")+1:name.rfind(\".\")])\n", 62 | " name = name[:name.rfind(\"_\")] + \".txt\"\n", 63 | " chunk_sum[name].append((chunk_number, f.read()))\n", 64 | "# print(chunk_number)\n", 65 | "# break" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "e42a54d2", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "len(chunk_sum)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "610b7898", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "dict_names = get_req_len_dict(\"IN\", \"test\") " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "cbf4995d", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "\n", 96 | "# Writing the resultant summaries to text files\n", 97 | "for filename in 
chunk_sum.keys():\n", 98 | "# print(filename)\n", 99 | "# print(chunk_sum[filename])\n", 100 | " sorted_chunks = (sorted(chunk_sum[filename], key=lambda x: x[0]))\n", 101 | " summary = ''\n", 102 | " for i in sorted_chunks:\n", 103 | " summary = summary + i[1]\n", 104 | "# print(summary)\n", 105 | " generated_length = len(summary.split(\" \"))\n", 106 | " required_length = dict_names[filename]\n", 107 | " if generated_length > required_length:\n", 108 | " summary = summary.split(\" \")\n", 109 | " summary = summary[:required_length]\n", 110 | " summary = \" \".join(summary)\n", 111 | " path = output_path + filename\n", 112 | " file = open(path,'w')\n", 113 | " file.write(summary)\n", 114 | " file.close()\n", 115 | "# break" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3.9.13 64-bit (microsoft store)", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.9.13" 136 | }, 137 | "vscode": { 138 | "interpreter": { 139 | "hash": "8600bfa2ac79ae324155cec4cf6445cf6a75cd33c56c859c7f5b87c430ce23f5" 140 | } 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 5 145 | } 146 | -------------------------------------------------------------------------------- /abstractive/Pointer Generator/readme.md: -------------------------------------------------------------------------------- 1 | # Scripts and codes 2 | 3 | pointer-generator - Repository - https://github.com/abisee/pointer-generator 4 | 5 | chunker.ipynb - Script to divide a text file into chunks. Here each chunk will be saved in a text file. 6 | 7 | combiner.ipynb - Script to combine the summaries generated for each chunk in a text file into the final summary 8 | 9 | # Usage 10 | 11 | 1. Change the path variable in the get_root_path function to the path of the dataset in utilities.py 12 | 2. Use chunker to chunk the input text files into chunk text files. 13 | 3. Use the pointer generator method to summarize each chunk. 14 | 4. Use the combiner to combine the summaries for every chunk to generate the final summaries. 15 | -------------------------------------------------------------------------------- /abstractive/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository contains scripts related to *Abstractive Summarization* algorithms for legal case documents. 
4 | 5 | | Directory | Description | 6 | | ------------- | ------------- | 7 | | BART based approaches | Scripts to fine-tune BART and generate summaries using BART_RR, BART and BERT-BART method | 8 | | BertSum-Abs | Implementation of BERTSum-abs and helper scripts | 9 | | Generate fine-tuning data | Scripts to generate fine tuning data using CLS, MCS, SIF and MCS_RR methods | 10 | | Legal-LED | Scripts to fine-tune and generate summaries using Legal-LED | 11 | | Legal-Pegasus | Scripts to fine-tune and generate summaries using Pegasus | 12 | | Pointer Generator | Implementation of Pointer-Generator and helper scripts | 13 | 14 | 15 | # Availability of Trained models 16 | 17 | The following models trained on the IN-Abs & UK-Abs datasets, are available at https://zenodo.org/record/7234359#.Y2tShdJByC1 18 | 19 | - Legal-LED 20 | - Legal-Pegasus 21 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This folder contains dataset of the paper *Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation* accepted at AACL-IJCNLP 2022. The dataset repository can be downloaded from https://zenodo.org/record/7152317#.Yz6mJ9JByC0 . 4 | 5 | There are 3 datasets : 6 | - IN-Abs : Indian Supreme Court case documents & their *abstractive* summaries, obtained from http://www.liiofindia.org/in/cases/cen/INSC/ 7 | - IN-Ext : Indian Supreme Court case documents & their *extractive* summaries, written by two law experts (A1, A2). 8 | - UK-Abs : United Kingdom (U.K.) Supreme Court case documents & their *abstractive* summaries, obtained from https://www.supremecourt.uk/decided-cases/ 9 | 10 | # Details of each dataset 11 | 12 | ## IN-Abs 13 | We crawled 7,130 full case documents and their corresponding abstractive summaries from http://www.liiofindia.org/in/cases/cen/INSC/ . 14 | Among them, 7030 (document, summary) pairs are randomly sampled as the training dataset. The remaining 100 (document, summary) 15 | pairs are considered as the test set. The directory structure is as follows : 16 | 17 | 18 | . 19 | ├── train-data # folder contains documents and summaries for training 20 | │ ├── judgement 21 | │ ├── summary 22 | │ ├── stats-IN-train.txt # text file containing the word and sentence count statistics of the documents 23 | └── test-data # folder contains documents and summaries for test 24 | │ ├── judgement 25 | │ ├── summary 26 | │ ├── stats-IN-test.txt # text file containing the word and sentence counts of the judgements & the summaries 27 | 28 | ## IN-Ext 29 | 30 | This dataset contains 50 Indian Supreme Court case documents and their extractive summaries. For each document, there are two summaries written individually by two law experts A1 and A2. Each summary by each law expert is of two types : full & segment-wise. 31 | 32 | - "full" : a coherent piece of summary. 33 | 34 | - "segment-wise" : the following segments are considered -- 'analysis', 'argument', 'facts', 'judgement', 'statute'. A "full" summary is broken down into these segments. Each segment is a folder. 35 | 36 | More details can be found in the associated README file of the dataset repository. 37 | 38 | The directory structure is as follows : 39 | 40 | 41 | . 
42 | ├── judgement # folder contains documents 43 | ├── summary 44 | │ ├── full # folder contains full summaries 45 | │ │ ├── A1 46 | │ │ ├── A2 47 | │ ├── segment-wise # folder contains segment-wise summaries 48 | │ │ ├── A1 49 | │ │ ├── A2 50 | ├── IN-EXT-length.txt # text file containing the word and sentence counts of the judgements & the summaries 51 | 52 | ## UK-Abs 53 | We crawled 793 full case documents and their corresponding abstractive summaries from https://www.supremecourt.uk/decided-cases/ . Among them, 693 (document, summary) pairs are randomly sampled as the training dataset. The remaining 100 (document, summary) pairs are considered as the test set. 54 | 55 | Similar to IN-Ext, the summaries in the dataset are of two types -- full and segment-wise. 56 | 57 | The segments in the UK dataset are 'background' , 'judgement' and 'reasons'. The test-data folder contains these segment-wise and full summaries. 58 | 59 | The directory structure is as follows : 60 | 61 | 62 | . 63 | ├── train-data # folder contains documents and summaries for training 64 | │ ├── judgement 65 | │ ├── summary 66 | │ ├── stats-UK-train.txt # text file containing the word and sentence count statistics of the documents 67 | └── test-data # folder contains documents and summaries for test 68 | │ ├── judgement 69 | │ ├── summary 70 | │ │ ├── full # folder contains full summaries 71 | │ │ ├── segment-wise # folder contains segment-wise summaries 72 | │ ├── stats-UK-test.txt # text file containing the word and sentence counts of the judgements & the summaries 73 | 74 | # Citation 75 | If you are using the dataset, please refer to the following paper: 76 | ``` 77 | @inproceedings{bhattacharya2021, 78 | title={Legal Case Document Summarization: Extractive and Abstractive Methods and their Evaluation}, 79 | author={Shukla, Abhay and Bhattacharya, Paheli and Poddar, Soham and Mukherjee, Rajdeep and Ghosh, Kripabandhu and Goyal, Pawan and Ghosh, Saptarshi}, 80 | booktitle={The 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing}, 81 | year={2022} 82 | } 83 | 84 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/README.md: -------------------------------------------------------------------------------- 1 | # CaseSummarizer 2 | 3 | Implementation of the paper [CaseSummarizer](http://www.aclweb.org/anthology/C16-2054), COLING 2016 4 | 5 | A sample input file, along with its pre-processing output and summary is present in the sample folders. 
6 | 7 | ### (Optional) Preprocess the documents 8 | Description : Tokenize, lemmatize, remove stopwords 9 | 10 | #### Run : 11 | ``` 12 | python preprocess.py path/to/input/folder/ path/to/output/folder 13 | ``` 14 | 15 | ### Generate Summary : 16 | 17 | #### Run : 18 | ``` 19 | python summary.py path/to/input/folder/ path/to/output/folder/ path/to/dictionary.txt/file/ 20 | python summary_length.py path/to/original/doc/folder/ path/to/output/folder/from/summary.py/ path/to/output/folder/ fraction/of/original/text/length/as/summary 21 | ``` 22 | 23 | ### Prerequisites 24 | * [Python 2.7+](https://www.python.org/download/releases/2.7/) 25 | * [Numpy](http://www.numpy.org/) 26 | * [NLTK](http://www.nltk.org/) 27 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 15 09:50:36 2019 4 | 5 | @author: Paheli 6 | """ 7 | 8 | #!/usr/bin/env python2 9 | # -*- coding: utf-8 -*- 10 | """ 11 | Created on Tue Sep 18 14:49:13 2018 12 | 13 | @author: user 14 | """ 15 | import sys 16 | import re 17 | import os 18 | 19 | path_r = sys.argv[1] 20 | path_w = sys.argv[2] 21 | #path_r = "C:\\Users\\Paheli\\Downloads\\temp" 22 | #path_w = "C:\\Users\\Paheli\\Downloads\\temp_proc" 23 | 24 | for f in os.listdir(path_r): 25 | 26 | fileName = os.path.join(path_r,f) 27 | file = open(fileName, 'r') 28 | 29 | documents = [] 30 | documents = (file.read()).strip().split("\n"); 31 | file.close() 32 | 33 | for i in range(0,len(documents)): 34 | line = documents[i] 35 | if line.startswith("1."): 36 | break 37 | doc = documents[i:] 38 | temp = "" 39 | for eachDocument in doc[:]: 40 | eachDocument = re.sub(r'(\d\d\d|\d\d|\d)\.\s', ' ', eachDocument)#removes the paragraph lables 1. or 2. etc. 41 | eachDocument = re.sub(r'(?<=[a-zA-Z])\.(?=\d)', '', eachDocument)#removes dot(.) 
i.e File No.1063 42 | eachDocument = re.sub(r'(?<=\d|[a-zA-Z])\.(?=\s[\da-z])', ' ', eachDocument)#to remove the ending dot of abbr 43 | eachDocument = re.sub(r'(?<=\d|[a-zA-Z])\.(?=\s?[\!\"\#\$\%\&\'\(\)\*\+\,\-\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~])', '', eachDocument)#to remove the ending dot of abbr 44 | eachDocument = re.sub(r'(?\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', eachDocument)#removes the other punctuations 45 | 46 | temp = temp +''+eachDocument 47 | documents = [] 48 | temp = temp.replace(" "," ") 49 | documents = temp.replace(" ","",1) 50 | 51 | file_w = open(os.path.join(path_w,f),"w") 52 | file_w.write(documents) 53 | file_w.close() -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample folder/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_processed/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_shortsumm/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_shortsumm/sample input.txt: -------------------------------------------------------------------------------- 1 | But under the Income tax Act the position is somewhat different It appears that a fresh partnership deed was drawn up in the year 1945 when Gilbert was brought in The appeal therefore fails and is dismissed with costs In 1924 Mathews went out and his share was taken over by Figgies and Notley The Tribunal in spite of this document took the view that under the Partnership Act a firm could be carried on even if there was a change in its constitution The High Court refused to look into this document as it had not been relied upon before the Tribunal and no reference bad been specifically made to it in the order of the Income tax Officer or the Assistant Commissioner It is true that under the law of partnership a firm has no legal existence apart from its partners and it is merely a compendious name to describe its partners but it is also equally true that under that law there is no dissolution of the firm by the mere incoming or outgoing of partners Reference was made by Mr Daphtary to the partnership deed drawn up in 1 It was argued that a different firm was then constituted The true question to decide is one of identity of the unit assessed under the Income tax Act 1918 which paid double tax in the year 1939 with the unit to whose business the private limited company succeeded in the year 1 We have no doubt that the Tribunal and the High Court were right in holding that in spite of the mere changes in the constitution of the firm the business of the firm as originally constituted continued as tea brokers right from its inception till the time it was succeeded by the limited company and that it was the same unit all through carrying on the same business at the same place and there was no cesser of that business or any change in the unit The assessee is a partnership concern It upheld the view taken by the Tribunal A firm can be charged as a distinct assessable entity as distinct from its partners who can also be assessed individually Sections 26 48 and 55 of the Act fully bear out 
this position In 1939 Hillman was brought in and the partnership consisted of these three partners The Income tax Officer disallowed the claim of the assessee on the ground that the partners of the firm in 1939 being different from the partners of the firm in 1947 no relief could be given to the applicant Section 3 which is the charging section is in these terms Where any Central Act enacts that income tax shall be charged for any year at any rate or rates tax at that rate or those rates shall be charged for that year in accordance with and subject to the provisions of this Act in respect of the total income of the previous year of every individual Hindu undivided family company and local authority and of every firm and other association of persons or the partners of the firm or the members of the association individually The partners of the firm are distinct assessable entities while the firm as such is a separate and distinct unit for purposes of assessment The result is that we see no substantial grounds for disturbing the opinion given by the High Court on the question submitted to it The law with respect to retiring partners as enacted in the Partnership Act is to a certain extent a -------------------------------------------------------------------------------- /extractive/CaseSummarizer/sample_summary/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/CaseSummarizer/summary_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Sep 18 15:41:27 2018 5 | 6 | @author: user 7 | """ 8 | import string 9 | import sys 10 | import math 11 | import os 12 | 13 | count = 0 14 | path_orig = sys.argv[1] 15 | path_sum = sys.argv[2] 16 | path_w = sys.argv[3] 17 | length = float(sys.argv[4]) 18 | 19 | for f in os.listdir(path_orig): 20 | url_orig = os.path.join(path_orig,f) 21 | url_sum = os.path.join(path_sum,f) 22 | 23 | count+=1 24 | print count 25 | 26 | fr = open(url_orig,"r") 27 | 28 | doc_length = 0 29 | for line in fr.readlines(): 30 | line = line.rstrip("\n") 31 | line = line.translate(None,string.punctuation) 32 | ls = line.split(" ") 33 | doc_length = doc_length+len(ls) 34 | 35 | summ_length = math.floor(length*float(doc_length)) 36 | print doc_length,summ_length 37 | 38 | 39 | output = "" 40 | fr = open(url_sum,"r") 41 | rem_length = summ_length 42 | for line in fr.readlines(): 43 | # line = line.rstrip("\n") 44 | line2 = line.translate(None,string.punctuation) 45 | ls = line2.split(" ") 46 | if len(ls) < rem_length: 47 | output+=line 48 | rem_length = rem_length-len(ls) 49 | #print rem_length 50 | if len(ls) > rem_length: 51 | last = ' '.join(ls[:int(rem_length)]) 52 | output+=last 53 | 54 | file_w = open(os.path.join(path_w,f),"w") 55 | file_w.write(output) 56 | file_w.close() -------------------------------------------------------------------------------- /extractive/Gist/README.md: -------------------------------------------------------------------------------- 1 | ### Gist 2 | 3 | Implementation of the paper [Extracting the Gist of Chinese Judgments of the Supreme Court](https://dl.acm.org/doi/10.1145/3322640.3326715), ICAIL 2019 4 | 5 | Like all extractive supervised summarization methods, the training data for the model should be sentences labelled 0 (not a summary sentence) or 1 (summary sentence). 
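A minimal sketch of what such a labelled training file is assumed to look like -- one sentence per line, followed by its 0/1 label after a `$$$` marker; this format is inferred from how the feature-generation scripts in `codes/` split each line on `$$$`, and the sentences below are made up:

```
The appeal is accordingly dismissed with costs .$$$1
Reference was made to the partnership deed drawn up in 1945 .$$$0
```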
6 | 7 | We may use the [extractive_labels.py](https://github.com/Law-AI/summarization/blob/aacl/extractive/abs_to_ext/extractive_labels.py) code to create this training data. 8 | 9 | ### Generate the handcrafted features for the training set 10 | 11 | `python generate_features.py --data_path /path/to/train/documents/ --features_path /outpath/of/train/features/ --important_words cue_phrases.txt --pos_tags postags.txt --w2v True` 12 | [if a pretrained word2vec model is not present, a model is trained based on the training set corpus] 13 | 14 | `python generate_features.py --data_path /path/to/train/documents/ --features_path /outpath/of/train/features/ --important_words cue_phrases.txt --pos_tags postags.txt --word2vec_path /path/to/trained/word2vec/model.bin` 15 | [if a pretrained word2vec model is present, mention its path] 16 | 17 | ### Train the LGBM classifier 18 | `python train.py --data_path /path/to/train/documents/ --features_path /outpath/of/train/features/ --model_path /path/to/save/learned/model/` 19 | 20 | *To view other options (show help):* 21 | `python train.py -h` 22 | 23 | 24 | ### Generate the handcrafted features for test set 25 | 26 | `python generate_features.py --data_path /path/to/test/documents/ --features_path /outpath/of/test/features/ --important_words cue_phrases.txt --pos_tags postags.txt --word2vec_path /path/to/trained/word2vec/model.bin` 27 | 28 | ### Infer summaries using the trained LGBM classifier 29 | 30 | `python infer.py --data_path /path/to/test/documents/ --features_path /outpath/of/test/features/ --summary_path /path/to/save/summaries/ --model_path /path/to/trained/model.pkl --length_file /path/to/summary/length/file` 31 | The prediction probability of each sentence in stored in /outpath/of/test/features/pred_scores/ [this directory is automatically created] 32 | 33 | - The trained model has been made publicly available at https://zenodo.org/record/7234359#.Y2fx09JBy-o 34 | 35 | 36 | ### format of length file 37 | 38 | ``` 39 | filename required-summary-length-in-words 40 | ``` 41 | ### External libraries required 42 | 43 | - lightgbm = 2.3.1 44 | - sklearn = 0.21.3 45 | - gensim = 3.4.0 46 | - tqdm 47 | -------------------------------------------------------------------------------- /extractive/Gist/codes/generate_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | from codes import word2vec, sent_pos, quant_feats, imp_words, word_emb, pos_tags, open_words 3 | 4 | 5 | def features(args): 6 | 7 | data_folder = args.data_path 8 | 9 | handcrafted_features = args.important_words 10 | postags = args.pos_tags 11 | 12 | 13 | if not os.path.exists(args.features_path): 14 | os.mkdir(args.features_path) 15 | 16 | 17 | if args.w2v: 18 | word2vec.train_word2vec(args.data_path,args.features_path) 19 | w2v_model = args.features_path+"word2vec_model.bin" 20 | else: 21 | w2v_model = args.word2vec_path 22 | 23 | 24 | 25 | sent_pos_folder = args.features_path+"sent-pos\\" 26 | if not os.path.exists(sent_pos_folder): 27 | os.mkdir(sent_pos_folder) 28 | 29 | print("\n\nsentence postition") 30 | sent_pos.run_code(data_folder, sent_pos_folder) 31 | 32 | print("\n\nQuantitative Features") 33 | quant_feats.run_code(data_folder, sent_pos_folder, args.features_path+"features_quant") 34 | 35 | print("\n\n Important Words Features") 36 | imp_words.run_code(handcrafted_features, data_folder, args.features_path+"features_impwords") 37 | 38 | print("\n\n Word Embedding Features") 39 | word_emb.run_code(w2v_model, 
data_folder, args.features_path+"features_wordemb") 40 | 41 | print("\n\n Part-of-Speech Features") 42 | pos_tags.run_code(postags, data_folder, args.features_path+"features_pos") 43 | 44 | print("\n\n Open Word Features") 45 | open_words.run_code(w2v_model, data_folder, args.features_path+"features_openword") 46 | -------------------------------------------------------------------------------- /extractive/Gist/codes/generate_summary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 31 09:54:03 2020 4 | 5 | @author: Paheli 6 | """ 7 | # -*- coding: utf-8 -*- 8 | """ 9 | Created on Tue Apr 7 20:19:23 2020 10 | 11 | @author: Paheli 12 | """ 13 | import joblib 14 | import pandas as pd 15 | import os 16 | from tqdm import tqdm 17 | import spacy 18 | import string 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | nlp = spacy.load('en_core_web_sm') 22 | 23 | 24 | def count_word(text): 25 | doc = nlp(text) 26 | tokens = [t.text for t in doc] 27 | tokens = [t for t in tokens if len(t.translate(t.maketrans('', '', string.punctuation + string.whitespace))) > 0] # + string.digits 28 | #print(tokens) 29 | 30 | return len(tokens) 31 | 32 | 33 | def summary(args): 34 | 35 | summary_length_file = args.length_file 36 | test_data_feats = args.features_path 37 | test_data_text = args.data_path 38 | 39 | 40 | 41 | summary_labelled_path = os.path.join(args.features_path+"pred_scores") 42 | if not os.path.exists(summary_labelled_path): 43 | os.mkdir(summary_labelled_path) 44 | 45 | if not os.path.exists(args.summary_path): 46 | os.mkdir(args.summary_path) 47 | 48 | 49 | features_a1a2 = test_data_feats+"features_quant\\" 50 | features_a3 = test_data_feats+"features_impwords\\" 51 | features_a4 = test_data_feats+"features_wordemb\\" 52 | features_a5 = test_data_feats+"features_pos\\" 53 | features_a6 = test_data_feats+"features_openword\\" 54 | 55 | 56 | #for f in tqdm(folds) : 57 | 58 | lgbm_pickle = joblib.load(args.model_path) 59 | 60 | fr_len = open(args.length_file,"r") 61 | 62 | for line in tqdm(fr_len.readlines()): 63 | line = line.rstrip("\n") 64 | ls = line.split("\t") 65 | f = ls[0] 66 | reqd_length = int(ls[1]) 67 | 68 | 69 | f_a1a2 = os.path.join(features_a1a2,f) 70 | f_a3 = os.path.join(features_a3,f) 71 | f_a4 = os.path.join(features_a4,f) 72 | f_a5 = os.path.join(features_a5,f) 73 | f_a6 = os.path.join(features_a6,f) 74 | 75 | 76 | 77 | df_a1a2 = pd.read_table(f_a1a2,delimiter=' ',header=None) 78 | df_a3 = pd.read_table(f_a3,delimiter=' ',header=None) 79 | 80 | df_a4 = pd.read_table(f_a4,delimiter=' ',header=None) 81 | df_a5 = pd.read_table(f_a5,delimiter=' ',header=None) 82 | 83 | df_a6 = pd.read_table(f_a6,delimiter=' ',header=None) 84 | 85 | 86 | 87 | df_features = pd.concat([df_a1a2,df_a3,df_a4,df_a5,df_a6],axis=1) 88 | 89 | sc = StandardScaler() 90 | df_features = sc.fit_transform(df_features) 91 | 92 | 93 | fw = open(os.path.join(summary_labelled_path,f),"w") 94 | ypred=lgbm_pickle.predict_proba(df_features) 95 | i=-1 96 | summary_scores = {} 97 | sentence_pos = {} 98 | 99 | fr = open(os.path.join(test_data_text,f),"r") 100 | 101 | for line in fr.readlines(): 102 | line = line.rstrip("\n") 103 | i+=1 104 | pred = ypred[i] 105 | sentence_pos[line] = i 106 | summary_scores[line] = pred[1] 107 | 108 | fw.write(line+"$$$"+str(pred[1])+"\n") 109 | #if (i==19): 110 | # break 111 | fw.close() 112 | fr.close() 113 | 114 | sort_sum = sorted(summary_scores.items(), key=lambda item: item[1], 
reverse = True) 115 | sort_len = sorted(sentence_pos.items(), key=lambda item: item[1], reverse = False) 116 | 117 | shortlist = [] 118 | 119 | 120 | fw = open(os.path.join(args.summary_path,f),"w") 121 | length_so_far = 0 122 | for tup in sort_sum: 123 | sent = tup[0] 124 | length = count_word(sent) 125 | if length_so_far+length>reqd_length: 126 | break 127 | else: 128 | length_so_far+=length 129 | shortlist.append(sent) 130 | 131 | for tup in sort_len: 132 | sent = tup[0] 133 | if sent in shortlist: 134 | fw.write(sent+"\n") 135 | fw.close() 136 | print(f,reqd_length,length_so_far) 137 | 138 | fr_len.close() 139 | 140 | 141 | -------------------------------------------------------------------------------- /extractive/Gist/codes/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/Gist/codes/imp_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 20:40:16 2020 4 | 5 | @author: Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Tue Apr 7 16:57:03 2020 11 | 12 | @author: Paheli 13 | """ 14 | import numpy as np 15 | from tqdm import tqdm 16 | import os 17 | 18 | def run_code(hcf_features,data_folder,features_folder): 19 | fr1 = open(hcf_features,"r") 20 | if not os.path.exists(features_folder): 21 | os.mkdir(features_folder) 22 | features = [] 23 | for line in fr1.readlines(): 24 | line = line.rstrip("\n") 25 | line = line.lower() 26 | features.append(line) 27 | 28 | for f in tqdm(os.listdir(data_folder)): 29 | file = os.path.join(data_folder,f) 30 | fr2 = open(file,"r") 31 | fw = open(os.path.join(features_folder,f),"w") 32 | 33 | for sentence in fr2.readlines(): 34 | sentence = sentence.rstrip("\n") 35 | if "$$$" in sentence: 36 | sls = sentence.split("$$$") 37 | sentence = sls[0] 38 | 39 | onehot = np.zeros(94) 40 | for i in range(0,94): 41 | f = features[i] 42 | if f.lower() in sentence.lower(): 43 | onehot[i] = 1 44 | fw.write(" ".join(map(str, onehot))+"\n") 45 | 46 | fw.close() 47 | 48 | -------------------------------------------------------------------------------- /extractive/Gist/codes/open_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 15 14:03:30 2020 4 | 5 | @author: Paheli 6 | """ 7 | from gensim.models import Word2Vec 8 | import numpy as np 9 | from nltk import word_tokenize 10 | from tqdm import tqdm 11 | import os 12 | 13 | def run_code(model_path, data_folder, features_folder): 14 | model = Word2Vec.load(model_path) 15 | if not os.path.exists(features_folder): 16 | os.mkdir(features_folder) 17 | #pathr = "F:\\Summarization\\ChineseGist\\NEW SETUP\\TEST\\test_processed\\processed\\" 18 | #pathw = "F:\\Summarization\\ChineseGist\\NEW SETUP\\features_a6\\" 19 | 20 | for f in tqdm(os.listdir(data_folder)): 21 | file = os.path.join(data_folder,f) 22 | fr = open(file,"r") 23 | fw = open(os.path.join(features_folder,f),"w") 24 | for sentence in fr.readlines(): 25 | sentence = sentence.rstrip("\n") 26 | if "$$$" in sentence: 27 | sls = sentence.split("$$$") 28 | sentence = sls[0] 29 | summation = np.zeros(100) 30 | length=0 31 | tokens = word_tokenize(sentence) 32 | openwords = tokens[:5] 33 | for word in openwords: 34 | if word in model.wv.vocab: 35 | vec = model.wv[word] 36 | summation = np.add(summation,vec) 37 | length+=1 
38 | 39 | if length > 0 : summation = summation/length 40 | summation = np.around(summation,decimals=4) 41 | fw.write(" ".join(map(str, summation))+"\n") 42 | 43 | fw.close() -------------------------------------------------------------------------------- /extractive/Gist/codes/pos_tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 15 12:16:56 2020 4 | 5 | @author: Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Tue Apr 7 23:11:24 2020 11 | 12 | @author: Paheli 13 | """ 14 | 15 | # -*- coding: utf-8 -*- 16 | """ 17 | Created on Tue Apr 7 16:44:29 2020 18 | 19 | @author: Paheli 20 | """ 21 | import numpy as np 22 | from tqdm import tqdm 23 | import nltk 24 | import os 25 | 26 | def run_code(postags, data_folder, features_folder): 27 | 28 | fr1 = open(postags,"r") 29 | if not os.path.exists(features_folder): 30 | os.mkdir(features_folder) 31 | features = [] 32 | for line in fr1.readlines(): 33 | line = line.rstrip("\n") 34 | features.append(line) 35 | 36 | 37 | 38 | for f in tqdm(os.listdir(data_folder)): 39 | file = os.path.join(data_folder,f) 40 | fr = open(file,"r") 41 | fw = open(os.path.join(features_folder,f),"w") 42 | for sentence in fr.readlines(): 43 | sentence = sentence.rstrip("\n") 44 | if "$$$" in sentence: 45 | sls = sentence.split("$$$") 46 | sentence = sls[0] 47 | tokens = nltk.word_tokenize(sentence) 48 | postags = nltk.pos_tag(tokens) 49 | ptags = np.zeros(35) 50 | count=0 51 | if len(postags)>=10: 52 | for pair in postags[:10]: 53 | onehot = np.zeros(35) 54 | ptag = pair[1] 55 | if ptag in features: 56 | index = features.index(ptag) 57 | onehot[index] = 1 58 | ptags = np.add(ptags,onehot) 59 | count+=1 60 | 61 | else: 62 | x = len(postags) 63 | for pair in postags[:x]: 64 | onehot = np.zeros(35) 65 | ptag = pair[1] 66 | if ptag in features: 67 | index = features.index(ptag) 68 | onehot[index] = 1 69 | ptags = np.add(ptags,onehot) 70 | count+=1 71 | ptags = ptags/count 72 | 73 | #print (ptags) 74 | fw.write(" ".join(map(str, ptags))+"\n") 75 | 76 | fw.close() 77 | 78 | 79 | -------------------------------------------------------------------------------- /extractive/Gist/codes/quant_feats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 19:22:57 2020 4 | 5 | @author: Paheli 6 | """ 7 | import string 8 | from nltk import word_tokenize 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | def feature_chars(sentence): 14 | chars = 0 15 | for n in sentence: 16 | chars+=1 17 | 18 | return chars 19 | 20 | def feature_words(sentence): 21 | words = len(word_tokenize(sentence)) 22 | return words 23 | 24 | def feature_uniqwords(sentence): 25 | w = word_tokenize(sentence) 26 | uniqwords=len(list(dict.fromkeys(w))) 27 | 28 | return uniqwords 29 | 30 | def feature_position(positions1,positions2,sentence): 31 | p1 = positions1.get(sentence) 32 | p2 = positions2.get(sentence) 33 | return p1,p2 34 | 35 | 36 | def run_code(data_folder,data_folder_sent_pos,features_folder): 37 | 38 | #data_folder_sent_pos= "F:\\Summarization\\ChineseGist\\NEW SETUP\\sent-pos\\" 39 | #data_folder = "F:\\Summarization\\ChineseGist\\NEW SETUP\\test_processed\\processed\\" 40 | #pathw = "F:\\Summarization\\ChineseGist\\NEW SETUP\\features_a1a2\\" 41 | if not os.path.exists(features_folder): 42 | os.mkdir(features_folder) 43 | 44 | 45 | positions1= {} 46 | positions2 = {} 47 | 
for file in os.listdir(data_folder_sent_pos): 48 | fr = open(os.path.join(data_folder_sent_pos,file),"r") 49 | for line in fr.readlines(): 50 | line = line.rstrip("\n") 51 | if "$$$" in line: 52 | ls = line.split("$$$") 53 | sent = ls[0].lstrip(" ").rstrip(" ") 54 | else: 55 | sent = line 56 | #sent = sent.translate(str.maketrans('', '', string.punctuation)) 57 | pos1 = ls[1].lstrip(" ").rstrip(" ") 58 | pos2 = ls[2].lstrip(" ").rstrip(" ") 59 | positions1[file+"///"+sent] = pos1 60 | positions2[file+"///"+sent] = pos2 61 | 62 | for file in tqdm(os.listdir(data_folder)): 63 | f = os.path.join(data_folder,file) 64 | fr1 = open(f,"r") 65 | fw = open(os.path.join(features_folder,file),"w") 66 | for line in fr1.readlines(): 67 | line = line.rstrip("\n") 68 | if "$$$" in line: 69 | ls = line.split("$$$") 70 | line = ls[0] 71 | 72 | line_orig = line.rstrip(" ") 73 | line = line.translate(str.maketrans('', '', string.punctuation)) 74 | s1 = int(feature_chars(line)) 75 | s2 = int(feature_words(line)) 76 | s3 = int(feature_uniqwords(line)) 77 | s4, s5 = feature_position(positions1,positions2,file+"///"+line_orig) 78 | 79 | sent = [s1,s2,s3,int(s4),float(s5)] 80 | 81 | sent = np.array(sent) 82 | 83 | sent = [s1,s2,s3,int(s4),float(s5)] 84 | sent = np.array(sent) 85 | 86 | fw.write(" ".join(map(str, sent))+"\n") 87 | fw.close() 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /extractive/Gist/codes/sent_pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 8 19:29:28 2020 4 | 5 | @author: Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Fri Apr 3 11:35:13 2020 11 | 12 | @author: Paheli 13 | """ 14 | import os 15 | from tqdm import tqdm 16 | 17 | def run_code(data_folder,sent_pos_folder): 18 | #data_folder = "F:\\Summarization\\ChineseGist\\NEW SETUP\\test_processed\\processed\\" 19 | #pathw = "F:\\Summarization\\ChineseGist\\NEW SETUP\\sent-pos\\" 20 | 21 | for file in tqdm(os.listdir(data_folder)): 22 | sent_pos = [] 23 | total_lines=0 24 | total_lines_list = [] 25 | fr = open(os.path.join(data_folder,file),"r") 26 | fw = open(os.path.join(sent_pos_folder,file),"w") 27 | 28 | for line in fr.readlines(): 29 | line = line.rstrip("\n") 30 | #ls = line.split("$$$") 31 | sent = line.lstrip(" ").rstrip(" ") 32 | total_lines+=1 33 | sent_pos.append(sent) 34 | total_lines_list.append(total_lines) 35 | #print(line) 36 | 37 | for i in range(0,len(sent_pos)): 38 | v = total_lines_list[i] 39 | k = sent_pos[i] 40 | rel = float(v)/float(total_lines) 41 | g = float("{:.5f}".format(rel)) 42 | fw.write(k+" $$$ "+ str(v)+ " $$$ " +str(g)+"\n") 43 | fw.close() 44 | fr.close() 45 | -------------------------------------------------------------------------------- /extractive/Gist/codes/train_classifier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from sklearn.preprocessing import StandardScaler 4 | import joblib,os 5 | from sklearn import model_selection 6 | from lightgbm import LGBMClassifier 7 | 8 | def train_lgbm(args): 9 | 10 | filelist = os.listdir(args.data_path) 11 | 12 | df_a1a2 = pd.DataFrame() 13 | df_a3 = pd.DataFrame() 14 | df_a4 = pd.DataFrame() 15 | df_a5 = pd.DataFrame() 16 | df_a6 = pd.DataFrame() 17 | label = [] 18 | 19 | for f in tqdm(filelist): 20 | f_a1a2 = os.path.join(args.features_path+"\\features_quant",f) 21 | f_a3 = 
os.path.join(args.features_path+"\\features_impwords",f) 22 | f_a4 = os.path.join(args.features_path+"\\features_wordemb",f) 23 | f_a5 = os.path.join(args.features_path+"\\features_pos",f) 24 | f_a6 = os.path.join(args.features_path+"\\features_openword",f) 25 | f_sent = os.path.join(args.features_path+"\\sent-pos",f) 26 | 27 | 28 | df_a1a2 = df_a1a2.append(pd.read_table(f_a1a2,delimiter=' ',header=None)) 29 | df_a3 = df_a3.append(pd.read_table(f_a3,delimiter=' ',header=None)) 30 | df_a4 = df_a4.append(pd.read_table(f_a4,delimiter=' ',header=None)) 31 | df_a5 = df_a5.append(pd.read_table(f_a5,delimiter=' ',header=None)) 32 | df_a6 = df_a6.append(pd.read_table(f_a6,delimiter=' ',header=None)) 33 | 34 | 35 | fr2 = open(f_sent,"r") 36 | for sentence in fr2.readlines(): 37 | sentence = sentence.rstrip("\n") 38 | ls = sentence.split("$$$") 39 | lab = ls[1].rstrip("\n").lstrip(" ").rstrip(" ") 40 | label.append(int(lab)) 41 | fr2.close() 42 | 43 | label = pd.Series(label) 44 | 45 | df_features = pd.concat([df_a1a2,df_a3,df_a4,df_a5,df_a6],axis=1) 46 | sc = StandardScaler() 47 | df_features = sc.fit_transform(df_features) 48 | 49 | 50 | print("TRAINING STARTS") 51 | 52 | 53 | 54 | lgbm = LGBMClassifier( 55 | objective='binary', 56 | boosting='gbdt', 57 | learning_rate = args.lr, #0.001, 58 | #max_depth = 8, 59 | num_leaves = args.numleaves,#1043, 60 | n_estimators = args.n_est,#400, 61 | bagging_fraction = 0.8, 62 | feature_fraction = 0.9) 63 | #reg_alpha = 0.2, 64 | #reg_lambda = 0.4)) 65 | 66 | mod = lgbm.fit(df_features, label) 67 | joblib.dump(mod,args.model_path+'summary_model.pkl') 68 | 69 | 70 | 71 | 72 | #train_data=lgb.Dataset(df_features,label=label) 73 | 74 | #param = {'num_leaves':1043, 'objective':'binary','learning_rate':0.001, 'boost_from_average':False} 75 | #param['metric'] = ['auc', 'binary_logloss'] 76 | 77 | #training our model using light gbm 78 | #num_round=50 79 | 80 | #lgbm=lgb.train(param,train_data,num_round) 81 | #joblib.dump(lgbm,args.model_path+'summary_model.pkl') 82 | -------------------------------------------------------------------------------- /extractive/Gist/codes/word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | from nltk import word_tokenize 3 | from gensim.models import Word2Vec 4 | import numpy as np 5 | import string 6 | from tqdm import tqdm 7 | 8 | def train_word2vec(train_data_path,save_model_path): 9 | sentences = [] 10 | print("reading files for word2vec") 11 | for file in tqdm(os.listdir(train_data_path)): 12 | fr = open(os.path.join(train_data_path,file),"r") 13 | for line in fr.readlines(): 14 | line = line.rstrip("\n") 15 | line = (line.split("$$$")[0]).lstrip(" ").rstrip(" ") 16 | line = line.translate(str.maketrans('', '', string.punctuation)) 17 | sentences.append(word_tokenize(line)) 18 | 19 | 20 | 21 | print ("starting word2vec training") 22 | model = Word2Vec(sentences, min_count=5) 23 | model.save(save_model_path+"word2vec_model.bin") 24 | # fw = open(save_model_path+"word-embs.txt","w") 25 | # for w in list(model.wv.vocab): 26 | # fw.write(w+" ") 27 | # vec = model.wv[w] 28 | # fw.write(" ".join(map(str, vec))+"\n") 29 | 30 | # fw.close() 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /extractive/Gist/codes/word_emb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 23:11:24 2020 4 | 5 | @author: 
Paheli 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Tue Apr 7 16:44:29 2020 11 | 12 | @author: Paheli 13 | """ 14 | from gensim.models import Word2Vec 15 | import numpy as np 16 | from nltk import word_tokenize 17 | from tqdm import tqdm 18 | import os 19 | 20 | def run_code(model_path,data_folder,features_folder): 21 | 22 | model = Word2Vec.load(model_path) 23 | if not os.path.exists(features_folder): 24 | os.mkdir(features_folder) 25 | 26 | for f in tqdm(os.listdir(data_folder)): 27 | file = os.path.join(data_folder,f) 28 | fr = open(file,"r") 29 | fw = open(os.path.join(features_folder,f),"w") 30 | for sentence in fr.readlines(): 31 | sentence = sentence.rstrip("\n") 32 | if "$$$" in sentence: 33 | sls = sentence.split("$$$") 34 | sentence = sls[0] 35 | summation = np.zeros(100) 36 | words = word_tokenize(sentence) 37 | length = 0 38 | for word in sentence: 39 | if word in model.wv.vocab: 40 | vec = model.wv[word] 41 | summation = np.add(summation,vec) 42 | length+=1 43 | 44 | if length > 0 : summation = summation/length 45 | summation = np.around(summation,decimals=4) 46 | fw.write(" ".join(map(str, summation))+"\n") 47 | 48 | fw.close() 49 | 50 | -------------------------------------------------------------------------------- /extractive/Gist/cue_phrases.txt: -------------------------------------------------------------------------------- 1 | Question for consideration is 2 | Questions for consideration are 3 | Points for consideration are 4 | Point for consideration is 5 | Question that arise for consideration is 6 | Questions that arise for consideration are 7 | Points that arise for consideration are 8 | Point that arise for consideration is 9 | Question before us is 10 | Questions before us are 11 | Points before us are 12 | Point before us is 13 | Question that arise before us is 14 | Questions that arise before us are 15 | Points that arise before us are 16 | Point that arise before us is 17 | find any reasons to interfere with 18 | find anything to interfere with 19 | we are not in agreement with 20 | we do not agree with 21 | Order under challenge in tha 22 | this order is under challenge 23 | Relevant facts in this case 24 | Facts of the case 25 | Facts of this case 26 | In the fact of this evidence 27 | On the basis of 28 | Court 29 | lower court 30 | appellate court 31 | authority 32 | We find that 33 | We did not find that 34 | We found that 35 | This court finds that 36 | This court found that 37 | This court did not find that 38 | It was found that 39 | We agree with 40 | in our view 41 | Petitioner 42 | Respondent 43 | Appellant 44 | plaintiff 45 | Act 46 | Constitution 47 | section 48 | sections 49 | article 50 | articles 51 | v 52 | vs 53 | Vs 54 | contending 55 | contend 56 | parties 57 | No provision in 58 | We hold 59 | valid 60 | legally valid 61 | legally not valid 62 | We agree with 63 | We are of the view that 64 | Statute 65 | In view 66 | According 67 | We are also of view 68 | petition 69 | Appeal 70 | Review 71 | Revision 72 | allowed 73 | dismissed 74 | upheld 75 | order of the court 76 | Dismiss 77 | dismissed 78 | dismissing sustained 79 | rejected 80 | Holding 81 | Tribunal 82 | suit 83 | trial court 84 | decree 85 | defendant 86 | provisions 87 | provision 88 | appeal 89 | appeals 90 | issue 91 | issue 92 | ratio 93 | fact 94 | facts 95 | 96 | -------------------------------------------------------------------------------- /extractive/Gist/generate_features.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jun 5 17:34:18 2021 4 | 5 | @author: Paheli 6 | """ 7 | import argparse 8 | from codes import generate_features 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 13 | parser.add_argument('--features_path', default = 'data/features/', type = str, help = 'Folder to store features of the textual data') 14 | parser.add_argument('--important_words', default = 'data/important_words.txt', type = str, help = 'Text file containing important/domain specific words') 15 | parser.add_argument('--pos_tags', default = 'data/postags.txt', type = str, help = 'Text file containing relevant pos tags for the task') 16 | parser.add_argument('--w2v', default = False, type = bool, help = 'Whether to train a word2vec model (on the train data)') 17 | parser.add_argument('--word2vec_path', default = 'data/word2vec_model.bin', type = str, help = 'Path where pretrained word2vec is present') 18 | 19 | 20 | args = parser.parse_args() 21 | 22 | print('Generating features for data in ...'+args.data_path, end = ' ') 23 | generate_features.features(args) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | 29 | -------------------------------------------------------------------------------- /extractive/Gist/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/Gist/infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jun 5 18:11:53 2021 4 | 5 | @author: Paheli 6 | """ 7 | # -*- coding: utf-8 -*- 8 | """ 9 | Created on Sat Jun 5 17:34:18 2021 10 | 11 | @author: Paheli 12 | """ 13 | import argparse 14 | from codes import generate_summary 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 19 | parser.add_argument('--features_path', default = 'data/features/', type = str, help = 'Folder to store features of the textual data') 20 | parser.add_argument('--summary_path', default = 'data/summaries/', type = str, help = 'Folder to store features of the textual data') 21 | 22 | parser.add_argument('--model_path', default = 'data/summary_model.pkl', type = str, help = 'Path where trained summarization model is present') 23 | parser.add_argument('--length_file', default = 'data/length.txt', type = str, help = 'Path to file containing summary length') 24 | 25 | args = parser.parse_args() 26 | 27 | 28 | generate_summary.summary(args) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /extractive/Gist/length.txt: -------------------------------------------------------------------------------- 1 | 78.txt 50 2 | 232.txt 80 3 | 266.txt 40 -------------------------------------------------------------------------------- /extractive/Gist/postags.txt: -------------------------------------------------------------------------------- 1 | CC 2 | CD 3 | DT 4 | EX 5 | FW 6 | IN 7 | JJ 8 | JJR 9 | JJS 10 | LS 11 | MD 12 | NN 13 | NNS 14 | NNP 15 | NNPS 16 | PDT 17 | POS 18 | PRP 19 | PRP$ 20 | RB 21 | RBR 22 | RBS 23 | RP 24 | TO 25 | UH 26 | 
VB 27 | VBD 28 | VBG 29 | VBN 30 | VBP 31 | VBZ 32 | WDT 33 | WP 34 | WP$ 35 | WRB -------------------------------------------------------------------------------- /extractive/Gist/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from codes import train_classifier 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | 7 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 8 | parser.add_argument('--features_path', default = 'features/', type = str, help = 'Folder to store features of the textual data') 9 | parser.add_argument('--model_path', default = 'features/', type = str, help = 'Path where trained summarization model is present') 10 | parser.add_argument('--lr', default = 0.001, type = float, help = 'learning rate of the model') 11 | parser.add_argument('--numleaves', default = 1043, type = int, help = 'no. of leaves') 12 | parser.add_argument('--n_est', default = 1043, type = int, help = 'no. of estimators') 13 | 14 | 15 | args = parser.parse_args() 16 | 17 | train_classifier.train_lgbm(args) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | 23 | -------------------------------------------------------------------------------- /extractive/GraphicalModel/README.md: -------------------------------------------------------------------------------- 1 | # GraphicalModel 2 | 3 | Implementation of the paper [Improving Legal Document Summarization Using Graphical Models](https://www.cse.iitm.ac.in/~ravi/papers/Saravanan_jurix_06.pdf), JURIX 2006 4 | 5 | ## Execution 6 | 7 | ```py 8 | $ python 9 | >>> import graphicalModel 10 | >>> summary = graphicalModel.get_summary('sample input.txt') 11 | ``` 12 | 13 | The following code assumes the reader has some basic understanding of the algorithm beforehand. The above instruction is the only step needed to extract tehe summary, the following is just an explanation of [graphicalModel.py](graphicalModel.py) 14 | 15 | We use ROUGE as a metric of summary evaluation. The top level file here is [graphicalModel.py](graphicalModel.py), which loops over all the testing documents, generates a summary via Graphical Model (CRF - Conditional Random Field), compares it with the manually annotated summary provided by WestLaw, computed and reports the ROUGE score for each document. 16 | 17 | The sample output is available at [`results_crf.txt`](results_crf.txt). The process fo generating summary for a legal document involves the following steps: 18 | 19 | * Training the CRF (needs to be trained just once, for all test documents) 20 | * Getting predicted categories for each sentence 21 | * Rank sentences on importance based on K-mixture model 22 | * Return summary based on max length of summary allowed 23 | 24 | ## Training the CRF 25 | 26 | Saravnan et. 
al. defined their own thematic roles/sections, different from the ones we followed, so the first step was to create an equivalent mapping: 27 | 28 | | Graphical Model category | Equivalent FIRE category | 29 | | --- | --- | 30 | | Identifying facts | Facts | 31 | | Establishing facts | Issue | 32 | | Arguing | Precedent | 33 | | Arguments | Arguments | 34 | | History | Ruling by lower court | 35 | | Arguments | Other general standards | 36 | | Ratio | Statute | 37 | | Final Decision | Ruling by present court | 38 | 39 | The features we use for training our CRF are: 40 | * The word, in lower case 41 | * The last 3 characters of the word (to detect suffixes like -ion, -ing) 42 | * The last 2 characters (suffixes like -ed) 43 | * Whether the word w is all capitals (abbreviation) 44 | * Whether w is a string or a number 45 | * Position of w within the sentence 46 | * Position of the paragraph in the document 47 | * POS tag 48 | * Presence of the aforementioned cues 49 | * Whether w is at the beginning or end of the sentence 50 | * The words surrounding w, i.e., the previous and the next word 51 | 52 | This is done via [crf_train.py](crf_train.py), which runs the training algorithm on all annotated documents in the `annotated` and `annotated_json` folders. The learned model is then saved as [`crf_alltrain.model`](crf_alltrain.model). 53 | 54 | ## Predicting categories / classes 55 | 56 | Once the CRF model is trained, we obtain the features and the classification for a test document via [`crf_test.py`](crf_test.py). 57 | In the current state, the test documents are HTML files, as in [`input_sample_test.html`](input_sample_test.html); the script handles HTML parsing and tokenization as well. The output of this script is the tokenized text, the features, and the category prediction for each sentence. 58 | 59 | ## Rank sentences 60 | 61 | After obtaining the predicted classes for a test document, we rank the sentences on the basis of importance. We use the K-mixture model to do this; the code is available as [`k_mix_model_test.py`](k_mix_model_test.py). It returns a mapping from each sentence to its score under the following heuristic, which builds on TF-IDF: 62 | ``` 63 | K-mixture model: the probability of word wi appearing k times is given as 64 | Pi(k) = (1-r) * δ(k,0) + (r/(s+1)) * (s/(s+1))^k 65 | 66 | δ(k,0) = 1 iff k = 0, else 0 67 | r = observed mean, s = observed IDF 68 | ``` 69 | 70 | ## Generating the summary 71 | 72 | We rank the sentences by their KMM scores (obtained above) and keep adding sentences in decreasing order of score until the template length of the summary is filled. Have a look at the `get_summary` function in [`graphicalModel.py`](graphicalModel.py) to see how this is implemented.
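As an illustration, here is a minimal sketch of this selection step (not the repository's exact `get_summary`; sentence ids are assumed to be 1-based, matching the dictionary returned by `k_mix_model_test.KMM`, and re-emitting the chosen sentences in document order is a design choice of this sketch):

```python
def build_summary(sentences, kmix_scores, max_words):
    """Greedily take sentences in decreasing K-mixture score until the
    word budget is exhausted, then restore document order."""
    # rank sentence ids (1-based) by descending K-mixture score
    ranked = sorted(kmix_scores, key=kmix_scores.get, reverse=True)
    chosen, used = set(), 0
    for sid in ranked:
        n_words = len(sentences[sid - 1].split())
        if used + n_words > max_words:
            break
        chosen.add(sid)
        used += n_words
    # emit the selected sentences in their original document order
    return " ".join(sentences[sid - 1] for sid in sorted(chosen))
```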
73 | -------------------------------------------------------------------------------- /extractive/GraphicalModel/crf_alltrain.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Law-AI/summarization/77edff66005cca4c897e608db6d47509c9e8d029/extractive/GraphicalModel/crf_alltrain.model -------------------------------------------------------------------------------- /extractive/GraphicalModel/crf_test.py: -------------------------------------------------------------------------------- 1 | ### 2 | ### For a given file, return the classification for each 3 | ### 4 | import crf_train 5 | import html2text 6 | import pycrfsuite 7 | import nltk 8 | import sys 9 | # import numpy as np 10 | import operator 11 | # from sklearn.metrics import classification_report 12 | 13 | 14 | def parse_html(file): 15 | ''' 16 | Return usable case content, as list 17 | ''' 18 | with open(file, 'r') as f: 19 | txt = f.read() 20 | 21 | txt=(txt.replace('\"\']|\"[^\"]*\"|\'[^\']*\')*>','')) 22 | t = html2text.html2text(txt) 23 | 24 | tokenized = nltk.tokenize.sent_tokenize(t) 25 | start = 0 26 | 27 | # The Judgment was delivered by / 'for educational use only' 28 | while start < len(tokenized) and 'the judgment was delivered by' not in tokenized[start].lower() : 29 | start = start + 1 30 | 31 | if start == len(tokenized): 32 | start = 0 33 | while 'for educational use only' not in tokenized[start].lower(): 34 | start = start + 1 35 | 36 | text, indices = [], [] 37 | for i in range(start+1,len(tokenized)): 38 | if 'thomson reuters south asia private limited' in tokenized[i].lower(): 39 | break 40 | text.append(tokenized[i].replace('\n',' ')) 41 | indices.append(i-start) 42 | 43 | return text, indices 44 | 45 | 46 | def test_crf(file, tagbot = None): 47 | ''' 48 | Use pre trained model (crf_alltrain) to classify file 49 | ''' 50 | text, indices = parse_html(file) 51 | length = len(indices) 52 | test_sentences = [] 53 | for line in text: 54 | tokens = nltk.tokenize.word_tokenize(line) 55 | labels = [(token, '-') for token in tokens] 56 | 57 | test_sentences.append(labels) 58 | 59 | test_postag = crf_train.add_postag(test_sentences) 60 | X_test = [crf_train.sentenceToFeatures(test_postag[i], indices[i]*1.0/length) for i in range(len(test_postag))] 61 | # test is not known at all 62 | # Y_test = [crf_train.sentenceToLabels(sentence) for sentence in test_postag] 63 | 64 | tagger = tagbot 65 | if tagger == None: 66 | tagger = pycrfsuite.Tagger() 67 | tagger.open('crf_alltrain.model') 68 | Y_pred = [tagger.tag(xseq) for xseq in X_test] 69 | 70 | # labels = {'-': 0, '?': 0} 71 | # counter = 1 72 | # for category in crf_train.cue_phrases: 73 | # labels[category] = counter 74 | # counter += 1 75 | 76 | # predictions = [labels[tag] for row in Y_pred for tag in row] 77 | ## Test is not known 78 | # # truths = [labels[tag] for row in Y_test for tag in row] 79 | # c = 0 80 | # for p, t in zip(predictions, truths): 81 | # if p+t!=0: 82 | # print(p, t) 83 | # c+=1 84 | # # predictions = np.array([labels[tag] for row in Y_pred for tag in row]) 85 | # # truths = np.array([labels[tag] for row in Y_test for tag in row]) 86 | # # # print(len(predictions)) 87 | # # # print(len(truths)) 88 | # # print(c, 'identified labels') 89 | # # print(predictions) 90 | # print( classification_report(truths, predictions,target_names=list(crf_train.cue_phrases.keys()) ) ) 91 | 92 | return text, X_test, Y_pred 93 | 94 | 95 | 96 | if __name__ == '__main__': 97 | ''' 98 | Nothing complex, 
call the MVP 99 | ''' 100 | if len(sys.argv) > 1: 101 | file = sys.argv[1] 102 | else: 103 | file = input('Provide file name to classify: ') 104 | test_crf(file) -------------------------------------------------------------------------------- /extractive/GraphicalModel/extract-categories.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | import re 4 | import json 5 | 6 | ANNOTATED_FOLDER = 'annotated' 7 | CATEGORIES_FOLDER = 'categories' 8 | 9 | category_abbr = { 10 | "Argument": "A", "Fact": "F", "Issue": "I", 11 | "Ruling by lower court": "LR", "Ruling by the present court": "R", 12 | "Statute": "SS", "Precedent": "SP", 13 | "Other general standards including customary equitable and other extra-legal considerations": "SO", 14 | } 15 | 16 | 17 | def extract_loop(): 18 | ''' 19 | run a loop to get all sentences 20 | categories - 21 | ''' 22 | stmts = {} 23 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'txt'] 24 | # print('Working on ', file) 25 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 26 | txt = f.readlines() 27 | txt = [i for i in txt if i!='\n'] 28 | start = txt.index('\n') 29 | 30 | text = '\n'.join(txt[start+1:len(txt)-1]) 31 | t2 = re.split("S[0-9]+ ", text) 32 | t2 = list(filter(None, t2)) # remove any empty strings 33 | for line in t2: 34 | category = line.split(' ')[0] 35 | if category == 'FE' or category == 'FI': 36 | category = 'F' 37 | if category not in stmts: 38 | stmts[category] = [] 39 | print(category,' first showed up') 40 | stmts[category].append(' '.join(line.split(' ')[1:])) # remove category 41 | 42 | # JSONs 43 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'json'] 44 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 45 | txt = f.readlines() 46 | tj = json.loads(''.join(txt)) 47 | 48 | for tagging in tj['annotation']: 49 | category = category_abbr[ tagging['label'][0] ] 50 | sentences = list(filter(None, tagging['points'][0]['text'].split('\n'))) 51 | if category not in stmts: 52 | stmts[category] = [] 53 | print(category,' first showed up') 54 | 55 | stmts[category].extend(sentences) 56 | 57 | 58 | print('Categories detected: ', ' '.join(stmts.keys())) 59 | 60 | print('Writing into files now') 61 | for category in stmts: 62 | categorized = stmts[category] 63 | categorized = [ i.rstrip() for i in categorized] 64 | categorytext = '\n'.join(categorized) 65 | with open(CATEGORIES_FOLDER + '/' + category + '.txt', 'w') as f: 66 | f.write(categorytext) 67 | print(category + ' done') 68 | 69 | if __name__ == '__main__': 70 | extract_loop() -------------------------------------------------------------------------------- /extractive/GraphicalModel/formulated_constants.py: -------------------------------------------------------------------------------- 1 | categorical_phrases = dict() 2 | categorical_pairs = dict() 3 | # https://www.isical.ac.in/~fire/2013/legal.html 4 | # Cannot unfortunately extract pairs automatically 5 | 6 | ## Fact 7 | # Months 8 | # 'the appellant' excluded because appellant already covered 9 | categorical_phrases['F'] = [ 10 | 'appellant','dated', 'No.', 'State', 'filed', 'Government','registration', 'basis', 11 | 'stated', 'notice', 'period', 'region', 12 | 'January', 'February', 'March','April','May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 13 | 14 | 'the plaintiff', 'the basis', 'basis of', 'Court of', 'stated that', 'filed a', 'was issued', 'the Constitution', 
15 | 'the Corporation', 'under Article','plaintiff and', 'show cause', 16 | 17 | 'High court of', 'that the said', 'referred to as', 'stated that the', 'In the year', 18 | ] 19 | 20 | categorical_pairs['F'] = { 21 | } 22 | 23 | ## Issue 24 | 25 | categorical_phrases['I'] = [ 26 | 'appeal by', 27 | 28 | ] 29 | 30 | categorical_pairs['I'] = { 31 | } 32 | 33 | ## Argument 34 | 35 | categorical_phrases['A'] = [ 36 | 'service', 'amount', 'value','section', 'goods','credit','license','supplied', 37 | 'recipient', 'country', 'quota', 'contended','contention', 38 | 39 | 'under Section', 'terms of', 'counsel for', 'before us', 'charged by', 'is charged', 'free of', 40 | 41 | 'charged by the', 'provided by the', 42 | 43 | ] 44 | 45 | categorical_pairs['A'] = { 46 | } 47 | 48 | ## Ruling by lower court 49 | 50 | categorical_phrases['LR'] = [ 51 | 'Tribunal', 'defendants', 'Judge', 'Bench','Division', 'trial', 52 | 53 | 'held that', 'trial court', 'the parent', 54 | 55 | 'that the defendants', 56 | ] 57 | 58 | categorical_pairs['LR'] = { 59 | } 60 | 61 | ## Statute 62 | 63 | categorical_phrases['SS'] = [ 64 | 'provisions', 'provision', 'act', 'section', 'chapter', '(', 'explanation', 'commission', 'agent', 'estates', 65 | 66 | 'of service', 'subject to', 'deemed to', 'or tenure', 67 | 68 | 'as may be', 'the estates of', 69 | 70 | 'the purpose of this', 'in a case where', 'as may be prescribed', 71 | ] 72 | 73 | categorical_pairs['SS'] = { 74 | } 75 | 76 | ## Precedent 77 | 78 | categorical_phrases['SP'] = [ 79 | 'v', 'Vs', 'vs', 'fact', 'SC', '&', 'conclusion', 'finding', 'SCC', 'S.C.R.', 'SCR', 80 | 'carrying', 'observed', 81 | 82 | 'this court', 'of law', 'question of', 'laid down', 'the question', 'was whether', 'decision was', 83 | 'evidence to', 'this Court', 'a finding', 84 | 85 | 'it was held', 'question of fact', 86 | ] 87 | 88 | categorical_pairs['SP'] = { 89 | } 90 | 91 | ## Other general standards 92 | years = [str(x) for x in list(range(1900, 2100))] 93 | categorical_phrases['SO'] = [ 94 | 'Civil', 'Appeal', 95 | 96 | 'Appeal No.', 97 | ] 98 | 99 | categorical_phrases['SO'].extend(years) 100 | categorical_pairs['SO'] = { 101 | } 102 | 103 | ## Ruling by the present court 104 | 105 | categorical_phrases['R'] = [ 106 | 'Dismiss', 'dismissed', 'dismissing', 'sustained', 'rejected', 'allowed','passed','case','considered','aside', 107 | 'preliminary','judgement','accordingly', 'amended', 'allow','owed', 'decree', 108 | 109 | 'the case', 'the Amending', 'set aside', 'order to',' appeal is', 'the Section', 'execution of', 'Appeal allowed', 110 | 111 | 'the part of', 'to the suit', 'of the Court', 112 | 113 | ] 114 | 115 | categorical_pairs['R'] = { 116 | } 117 | 118 | 119 | def include_toggled_list(l): 120 | ''' 121 | If list has summary, it should also have Summary and summary. 
122 | ''' 123 | l_ = list() 124 | for item in l: 125 | l_.append(item + '.') 126 | l += l_ 127 | l_ = list() 128 | 129 | for item in l: 130 | first_toggled = item[0].lower() if item[0].isupper() else item[0].upper() 131 | l_.append(first_toggled + item[1:]) 132 | l += l_ 133 | l = list(set(l)) 134 | 135 | 136 | def include_toggled_dict(d): 137 | ''' 138 | If dict has summary: x, it should also have Summary: x 139 | ''' 140 | d_ = dict() 141 | for key in d: 142 | first_toggled = key[0].lower() if key[0].isupper() else key[0].upper() 143 | d_[first_toggled + key[1:]] = d[key] 144 | # d = {**d, **d_} 145 | d.update(d_) 146 | 147 | 148 | def include_toggled_phrases(phrases): 149 | ''' 150 | Iterator for each category 151 | ''' 152 | for category in phrases: 153 | include_toggled_list(phrases[category]) 154 | 155 | 156 | def include_toggled_pairs(pairs): 157 | ''' 158 | Iterator for each categorical pair 159 | ''' 160 | total_pairs, c = sum([ len(pairs[category]) for category in pairs ]), 0 161 | for category in pairs: 162 | # 'If': ['so be'], 163 | # to be extended to 164 | # 'If': ['so be', 'So be'], 'if': ['so be', 'So be'] 165 | 166 | if category == 'arguing': 167 | # print('Passing', category) 168 | for key in pairs[category]: 169 | pairs[category][key] = list( set( pairs[category][key] ) ) 170 | pass 171 | # print('Not passing', category) 172 | for key in pairs[category]: 173 | # print ( pairs[category][key] ) 174 | include_toggled_list( pairs[category][key] ) 175 | c += 1 176 | # print(key, " => ", c, "/", total_pairs) 177 | 178 | c = 0 179 | for category in pairs: 180 | d2 = {} 181 | for key in pairs[category]: 182 | k2 = key[0].upper() if key[0].islower() else key[0].lower() 183 | d2[k2 + key[1:]] = pairs[category][key] 184 | c += 1 185 | # print(key, " => ", c, "/", total_pairs) 186 | pairs[category].update(d2) 187 | 188 | # make a set 189 | for category in pairs: 190 | for key in pairs[category]: 191 | pairs[category][key] = list( set( pairs[category][key] ) ) -------------------------------------------------------------------------------- /extractive/GraphicalModel/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/GraphicalModel/k_mix_model_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import math 4 | import html2text 5 | import operator 6 | import itertools 7 | import operator 8 | import nltk 9 | from nltk import tokenize 10 | from nltk.corpus import stopwords, wordnet as wn 11 | import crf_test 12 | 13 | # 14 | # python k-mix-model-test.py ../FullText_html/2010/2010_A_1.html 15 | # Pg 139 / 207 in thesis 16 | # 17 | 18 | stopwords = set(stopwords.words('english')) 19 | # intentionally skipping out quotes here 20 | punctuation = '!#$%&()*+,-./:;<=>?@[\\]^_`{|}~' 21 | for each in punctuation: 22 | stopwords.add(each) 23 | 24 | 25 | def accumulate(l): 26 | ''' 27 | Group By Key 28 | ''' 29 | it = itertools.groupby(l, operator.itemgetter(0)) 30 | for key, subiter in it: 31 | yield key, sum(item[1] for item in subiter) 32 | 33 | 34 | # Don't process tags, just strip them out. Use an indent of 4 spaces 35 | # and a page that's 80 characters wide. 
36 | 37 | # 1st Tokenize and form sentence 38 | def tokenizeSentence(fileContent): 39 | """ 40 | :param fileContent: fileContent is the file content which needs to be summarized 41 | :return: Returns a list of string(sentences) 42 | """ 43 | return tokenize.sent_tokenize(fileContent) 44 | 45 | 46 | # 2nd Case Folding 47 | def caseFolding(line): 48 | """ 49 | :param line is the input on which case folding needs to be done 50 | :return: Line with all characters in lower case 51 | """ 52 | return line.lower() 53 | 54 | 55 | # 3rd Tokenize and form tokens from sentence 56 | def tokenizeLine(sentence): 57 | """ 58 | :param sentence: Sentence is the english sentence of the file 59 | :return: List of tokens 60 | """ 61 | return tokenize.word_tokenize(sentence) 62 | 63 | 64 | # 4th Stop Word Removal 65 | def stopWordRemove(tokens): 66 | """ 67 | :param tokens: List of Tokens 68 | :return: List of tokens after removing stop words 69 | """ 70 | # list_tokens = [] 71 | # for token in tokens: 72 | # if token not in stopwords: 73 | # list_tokens.append(token) 74 | # return list_tokens 75 | list_tokens = [token for token in tokens if token not in stopwords] 76 | return list_tokens 77 | 78 | 79 | def KMM(eval_file): 80 | 81 | test_sentences, _ = crf_test.parse_html(eval_file) 82 | t2 = [caseFolding(sentence) for sentence in test_sentences] 83 | test_sentences = t2 84 | 85 | index, docId = {}, 1 86 | 87 | for sentence in test_sentences: 88 | sentence = sentence.strip() 89 | if len(sentence) == 0: 90 | continue 91 | 92 | tokens = nltk.tokenize.word_tokenize(sentence) 93 | final_tokens = stopWordRemove(tokens) 94 | for token in final_tokens: 95 | if token not in index: 96 | index[token] = [ (docId, 1) ] 97 | else: 98 | last_entry = index[token][-1] 99 | if last_entry[0] == docId: 100 | index[token][-1] = (docId, last_entry[1]+1) 101 | else: 102 | index[token].append( (docId, 1) ) 103 | 104 | docId = docId + 1 105 | # for key in index: 106 | # getList=index[key] 107 | # index[key]=list(accumulate(getList)) 108 | 109 | N, cf, tf, df = 360, 0, 0, 0 110 | num= math.log(N)/math.log(2) 111 | kmixresult={} 112 | sent_id=1 113 | 114 | for sentence in test_sentences: 115 | pk=0 116 | myline=tokenizeLine(sentence) 117 | final_tokens=stopWordRemove(myline) 118 | 119 | for token in final_tokens: 120 | if token not in index: 121 | continue 122 | tokenlist = index[token] 123 | cf = 0 124 | for x,y in tokenlist: 125 | if x == 1: 126 | tf = y 127 | df=len(tokenlist) 128 | 129 | cf = cf + y 130 | #print 'token='+token 131 | t= cf * 1.0 / N 132 | #print 't='+str(t) 133 | idf = num * 1.0 / df if df else 0 134 | #print 'idf='+str(idf) 135 | s = ( ( (cf-df) * 1.0) / df ) if df else 0 136 | #print 's='+str(s) 137 | r=1 138 | r = t/s if s else 0 139 | #print 'r='+str(r) 140 | pk = (r / (s+1) ) * (s / (s+1) ) + math.pow( (s / (s+1) ), tf) 141 | 142 | #print sentence 143 | #print pk 144 | kmixresult[sent_id] = pk 145 | sent_id = sent_id + 1 146 | # print(sentence, kmixresult[sent_id-1]) 147 | 148 | # for sentence in test_sentences: 149 | # print(sentence) 150 | 151 | # print(len(test_sentences)) 152 | # print(kmixresult) 153 | 154 | # kmix_sorted = sorted(kmixresult.items(), key=operator.itemgetter(1),reverse=True) 155 | # for key, value in kmix_sorted: 156 | # print(test_sentences[key-1]) 157 | 158 | return kmixresult 159 | 160 | 161 | if __name__ == '__main__': 162 | if len(sys.argv) > 1: 163 | file = sys.argv[1] 164 | else: 165 | file = input('Input file to compute scores for: ') 166 | 167 | KMM(file) 
-------------------------------------------------------------------------------- /extractive/GraphicalModel/k_mix_output.txt: -------------------------------------------------------------------------------- 1 | 28 38178520590013.51 2 | 82 7881645488754.992 3 | 101 33575773026.61672 4 | 22 1364470.2612857409 5 | 54 72495.7513888907 6 | 108 43626.25160667495 7 | 102 29825.09064748152 8 | 31 22892.134259259423 9 | 43 22892.134259259423 10 | 75 22892.134259259423 11 | 63 3788.987901833185 12 | 65 997.7629629629637 13 | 80 898.7824074074077 14 | 109 647.6126543209858 15 | 83 607.6975349433403 16 | 88 25.40546790173725 17 | 72 22.837191358024697 18 | 61 22.556018518518613 19 | 113 11.727969348658974 20 | 95 10.322311024562635 21 | 7 10.185326254232073 22 | 21 5.3516590024585176 23 | 67 5.3516590024585176 24 | 66 4.360383345537698 25 | 99 4.142083258445258 26 | 119 3.8282828282828305 27 | 107 3.784522803294937 28 | 116 3.636534839924669 29 | 73 3.349537037037038 30 | 48 3.3184104052444177 31 | 52 3.30304609876581 32 | 57 3.254129263116958 33 | 44 3.1158968880122733 34 | 118 3.087994034302757 35 | 37 3.034656814425525 36 | 15 2.787560079163129 37 | 86 2.559601742460641 38 | 87 2.559601742460641 39 | 85 2.183842592592593 40 | 55 1.886114937251301 41 | 13 1.7900508351488742 42 | 2 1.708509142053446 43 | 24 1.7067547055251968 44 | 5 1.6447378071140446 45 | 71 1.6053839218632604 46 | 19 1.5865484429065748 47 | 33 1.5583498677248677 48 | 115 1.5264147396127088 49 | 25 1.506456182958915 50 | 30 1.4200867052023125 51 | 117 1.3602677189724544 52 | 35 1.3319502074688796 53 | 41 1.2317673378076062 54 | 53 1.1426746306939062 55 | 100 1.1299515121630506 56 | 46 1.126531165311653 57 | 89 1.1068399452804376 58 | 34 1.0748245221027481 59 | 64 1.065339723526343 60 | 12 1.0566274319459612 61 | 90 1.0566274319459612 62 | 50 1.0496062992125987 63 | 40 1.025999243207236 64 | 94 1.0007581018518519 65 | 112 0.9680157946692991 66 | 26 0.9399547661981307 67 | 84 0.9399547661981307 68 | 8 0.8745990836197023 69 | 6 0.8711131862512143 70 | 32 0.8671118149451424 71 | 49 0.8671118149451424 72 | 74 0.8287825324668168 73 | 36 0.8269119181501733 74 | 47 0.8125956535047445 75 | 76 0.8116279069767441 76 | 51 0.796712744713269 77 | 58 0.7880797785666196 78 | 4 0.7046274333689918 79 | 91 0.7046274333689918 80 | 120 0.7046274333689918 81 | 16 0.636318407960199 82 | 1 0.6254374882950248 83 | 14 0.6235031663788141 84 | 23 0.48043987891327417 85 | 29 0.48043987891327417 86 | 9 0.2658674726287928 87 | 11 0.2658674726287928 88 | 17 0.2658674726287928 89 | 18 0.2658674726287928 90 | 20 0.2658674726287928 91 | 45 0.2658674726287928 92 | 56 0.2658674726287928 93 | 59 0.2658674726287928 94 | 68 0.2658674726287928 95 | 70 0.2658674726287928 96 | 77 0.2658674726287928 97 | 78 0.2658674726287928 98 | 81 0.2658674726287928 99 | 92 0.2658674726287928 100 | 97 0.2658674726287928 101 | 104 0.2658674726287928 102 | 106 0.2658674726287928 103 | 114 0.2658674726287928 104 | 93 0 105 | 111 -0.9504830917874396 106 | 3 -9.399999999999793 107 | 96 -9.399999999999793 108 | 38 -43.83194444444445 109 | 42 -88.66388888888876 110 | 103 -102.86783284742458 111 | 10 -326.65568186028565 112 | 60 -496.4928232689209 113 | 69 -23102081326.034996 114 | 79 -4.247436601304027e+18 115 | 98 -2.909398467025563e+33 116 | 105 -2.6681765651173983e+36 117 | 110 -1.289125498056627e+52 118 | 62 -2.1985642013044388e+70 119 | 27 -2.5956783858709018e+116 120 | 39 -1.6232370264991055e+162 121 | -------------------------------------------------------------------------------- 
/extractive/LetSum/README.md: -------------------------------------------------------------------------------- 1 | # LetSum 2 | 3 | Implementation of the paper [LetSum, a Text Summarization system in Law field](http://rali.iro.umontreal.ca/rali/?q=en/node/673), JURIX 2004 4 | 5 | ## Execution 6 | 7 | ``` 8 | $ python 9 | >>> import letsum 10 | >>> import letsum_test 11 | >>> summary = letsum_test.LetSum('sample input.txt') 12 | ``` 13 | 14 | We use ROUGE as the metric for summary evaluation. The top-level file here is [letsum.py](letsum.py), which loops over all the testing documents, generates a summary, compares it with the manually annotated summary provided by WestLaw, and computes and reports the ROUGE score for each document. Following is an explanation of how the top-level file operates. 15 | 16 | The sample output is available at [`results_letsum.txt`](results_letsum.txt). The summary is generated as follows: 17 | 18 | We first define the mapping as done previously in the following order: 19 | 20 | | FIRE category | Equivalent LetSum category | 21 | | --- | --- | 22 | | Facts | Introduction | 23 | | Issue | Introduction | 24 | | Arguments | Context | 25 | | Ruling by lower court | Context | 26 | | Statute | Analysis | 27 | | Precedent | Analysis | 28 | | Other general standards | Conclusion | 29 | | Ruling by present court | Conclusion | 30 | 31 | This is required since LetSum follows a template-filling approach, where each theme is allotted a percentage of the total permissible summary length: 32 | 33 | | Category | Percentage in summary | 34 | | --- | --- | 35 | | Introduction | 10 | 36 | | Context | 24 | 37 | | Analysis | 60 | 38 | | Conclusion | 6 | 39 | | Citation | 0 | 40 | 41 | The total length of the summary is restricted to *34%* of the total document length. Note that unlike the Graphical Model, we do not need to train any model beforehand. 42 | 43 | [`letsum_test.py`](letsum_test.py) expects an argument, in the form of [`input_sample_test.html`](input_sample_test.html), specifying the legal document to be summarized. 44 | 45 | ### Labelling each sentence 46 | 47 | For each sentence in the document, we compute a score indicating the number of cue phrases / cue pairs present in that sentence. The category with the highest score is assigned as the label for that sentence (Introduction/Context/...). In the absence of any cues, a default label of _Context_ is assigned to the sentence. 48 | 49 | ### Ranking importance of sentences 50 | 51 | Once the category for each sentence has been identified, we want to retrieve the most relevant phrases within that category. Importance is assigned via a heuristic function, consisting of the position of the paragraph in the document, the position of the sentence in the paragraph, TF-IDF, and other factors. However, this function was not available, so we rank sentences based on their TF-IDF scores within each category. Here, we avoid ranking stop words to keep the summary as informative as possible. 52 | 53 | ### Generating the summary 54 | 55 | After ranking the phrases/words by their TF-IDF scores within each category, we sort them in decreasing order and keep adding phrases to the output summary until the word threshold for that category is reached. 56 | 57 | Say, for instance, a document of length 1600 words will have a summary of at most 1600*34% = 544 words, of which Context will be 24% = 130 words. In decreasing order of TF-IDF scores, we keep adding words belonging to the category of "Context" as long as the output summary has fewer than 130 words.
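To make this arithmetic concrete, the snippet below is a minimal sketch written for this README only; the helper name `category_budgets` and the integer truncation are our own choices and not code from this repository (the percentages are copied from the tables above):

```python
# Percentages taken from the tables above; SUMMARY_PERCENT mirrors the 34% overall limit.
TEMPLATE_PERCENT = {'Introduction': 10, 'Context': 24, 'Analysis': 60, 'Conclusion': 6, 'Citation': 0}
SUMMARY_PERCENT = 34

def category_budgets(doc_word_count):
    """Hypothetical helper: word budget allotted to each LetSum category for one document."""
    total_budget = doc_word_count * SUMMARY_PERCENT // 100
    return {cat: total_budget * pct // 100 for cat, pct in TEMPLATE_PERCENT.items()}

# category_budgets(1600)
# -> {'Introduction': 54, 'Context': 130, 'Analysis': 326, 'Conclusion': 32, 'Citation': 0}
```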
58 | 59 | ### Grammatical modifications 60 | 61 | Since the phrases might not make sense on their own, certain grammatical corrections have to be made in order for the summary to make sense. However, no further work from the authors of LetSum focused on this aspect, so this phase was not applied in our work. 62 | 63 | -------------------------------------------------------------------------------- /extractive/LetSum/extract-categories.py: -------------------------------------------------------------------------------- 1 | # python3 2 | import os 3 | import re 4 | import json 5 | 6 | ANNOTATED_FOLDER = 'annotated' 7 | CATEGORIES_FOLDER = 'categories' 8 | 9 | category_abbr = { 10 | "Argument": "A", "Fact": "F", "Issue": "I", 11 | "Ruling by lower court": "LR", "Ruling by the present court": "R", 12 | "Statute": "SS", "Precedent": "SP", 13 | "Other general standards including customary equitable and other extra-legal considerations": "SO", 14 | } 15 | 16 | 17 | def extract_loop(): 18 | ''' 19 | run a loop over the annotated files to get all sentences, 20 | grouped by category 21 | ''' 22 | stmts = {} 23 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'txt']: 24 | # print('Working on ', file) 25 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 26 | txt = f.readlines() 27 | txt = [i for i in txt if i!='\n'] 28 | start = txt.index('\n') 29 | 30 | text = '\n'.join(txt[start+1:len(txt)-1]) 31 | t2 = re.split("S[0-9]+ ", text) 32 | t2 = list(filter(None, t2)) # remove any empty strings 33 | for line in t2: 34 | category = line.split(' ')[0] 35 | if category == 'FE' or category == 'FI': 36 | category = 'F' 37 | if category not in stmts: 38 | stmts[category] = [] 39 | print(category,' first showed up') 40 | stmts[category].append(' '.join(line.split(' ')[1:])) # remove category 41 | 42 | # JSONs 43 | for file in [doc for doc in os.listdir(ANNOTATED_FOLDER) if doc.split('.')[-1] == 'json']: 44 | with open(ANNOTATED_FOLDER + '/' + file, 'r') as f: 45 | txt = f.readlines() 46 | tj = json.loads(''.join(txt)) 47 | 48 | for tagging in tj['annotation']: 49 | category = category_abbr[ tagging['label'][0] ] 50 | sentences = list(filter(None, tagging['points'][0]['text'].split('\n'))) 51 | if category not in stmts: 52 | stmts[category] = [] 53 | print(category,' first showed up') 54 | 55 | stmts[category].extend(sentences) 56 | 57 | 58 | print('Categories detected: ', ' '.join(stmts.keys())) 59 | 60 | print('Writing into files now') 61 | for category in stmts: 62 | categorized = stmts[category] 63 | categorized = [ i.rstrip() for i in categorized] 64 | categorytext = '\n'.join(categorized) 65 | with open(CATEGORIES_FOLDER + '/' + category + '.txt', 'w') as f: 66 | f.write(categorytext) 67 | print(category + ' done') 68 | 69 | if __name__ == '__main__': 70 | extract_loop() -------------------------------------------------------------------------------- /extractive/LetSum/formulated_constants.py: -------------------------------------------------------------------------------- 1 | categorical_phrases = dict() 2 | categorical_pairs = dict() 3 | # https://www.isical.ac.in/~fire/2013/legal.html 4 | # Cannot unfortunately extract pairs automatically 5 | 6 | ## Fact 7 | # Months 8 | # 'the appellant' excluded because appellant already covered 9 | categorical_phrases['F'] = [ 10 | 'appellant','dated', 'No.', 'State', 'filed', 'Government','registration',
'basis', 11 | 'stated', 'notice', 'period', 'region', 12 | 'January', 'February', 'March','April','May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 13 | 14 | 'the plaintiff', 'the basis', 'basis of', 'Court of', 'stated that', 'filed a', 'was issued', 'the Constitution', 15 | 'the Corporation', 'under Article','plaintiff and', 'show cause', 16 | 17 | 'High court of', 'that the said', 'referred to as', 'stated that the', 'In the year', 18 | ] 19 | 20 | categorical_pairs['F'] = { 21 | } 22 | 23 | ## Issue 24 | 25 | categorical_phrases['I'] = [ 26 | 'appeal by', 27 | 28 | ] 29 | 30 | categorical_pairs['I'] = { 31 | } 32 | 33 | ## Argument 34 | 35 | categorical_phrases['A'] = [ 36 | 'service', 'amount', 'value','section', 'goods','credit','license','supplied', 37 | 'recipient', 'country', 'quota', 'contended','contention', 38 | 39 | 'under Section', 'terms of', 'counsel for', 'before us', 'charged by', 'is charged', 'free of', 40 | 41 | 'charged by the', 'provided by the', 42 | 43 | ] 44 | 45 | categorical_pairs['A'] = { 46 | } 47 | 48 | ## Ruling by lower court 49 | 50 | categorical_phrases['LR'] = [ 51 | 'Tribunal', 'defendants', 'Judge', 'Bench','Division', 'trial', 52 | 53 | 'held that', 'trial court', 'the parent', 54 | 55 | 'that the defendants', 56 | ] 57 | 58 | categorical_pairs['LR'] = { 59 | } 60 | 61 | ## Statute 62 | 63 | categorical_phrases['SS'] = [ 64 | 'provisions', 'provision', 'act', 'section', 'chapter', '(', 'explanation', 'commission', 'agent', 'estates', 65 | 66 | 'of service', 'subject to', 'deemed to', 'or tenure', 67 | 68 | 'as may be', 'the estates of', 69 | 70 | 'the purpose of this', 'in a case where', 'as may be prescribed', 71 | ] 72 | 73 | categorical_pairs['SS'] = { 74 | } 75 | 76 | ## Precedent 77 | 78 | categorical_phrases['SP'] = [ 79 | 'v', 'Vs', 'vs', 'fact', 'SC', '&', 'conclusion', 'finding', 'SCC', 'S.C.R.', 'SCR', 80 | 'carrying', 'observed', 81 | 82 | 'this court', 'of law', 'question of', 'laid down', 'the question', 'was whether', 'decision was', 83 | 'evidence to', 'this Court', 'a finding', 84 | 85 | 'it was held', 'question of fact', 86 | ] 87 | 88 | categorical_pairs['SP'] = { 89 | } 90 | 91 | ## Other general standards 92 | years = [str(x) for x in list(range(1900, 2100))] 93 | categorical_phrases['SO'] = [ 94 | 'Civil', 'Appeal', 95 | 96 | 'Appeal No.', 97 | ] 98 | 99 | categorical_phrases['SO'].extend(years) 100 | categorical_pairs['SO'] = { 101 | } 102 | 103 | ## Ruling by the present court 104 | 105 | categorical_phrases['R'] = [ 106 | 'Dismiss', 'dismissed', 'dismissing', 'sustained', 'rejected', 'allowed','passed','case','considered','aside', 107 | 'preliminary','judgement','accordingly', 'amended', 'allow','owed', 'decree', 108 | 109 | 'the case', 'the Amending', 'set aside', 'order to',' appeal is', 'the Section', 'execution of', 'Appeal allowed', 110 | 111 | 'the part of', 'to the suit', 'of the Court', 112 | 113 | ] 114 | 115 | categorical_pairs['R'] = { 116 | } 117 | 118 | 119 | def include_toggled_list(l): 120 | ''' 121 | If list has summary, it should also have Summary and summary. 
122 | ''' 123 | l_ = list() 124 | for item in l: 125 | l_.append(item + '.') 126 | l += l_ 127 | l_ = list() 128 | 129 | for item in l: 130 | first_toggled = item[0].lower() if item[0].isupper() else item[0].upper() 131 | l_.append(first_toggled + item[1:]) 132 | l += l_ 133 | l = list(set(l)) 134 | 135 | 136 | def include_toggled_dict(d): 137 | ''' 138 | If dict has summary: x, it should also have Summary: x 139 | ''' 140 | d_ = dict() 141 | for key in d: 142 | first_toggled = key[0].lower() if key[0].isupper() else key[0].upper() 143 | d_[first_toggled + key[1:]] = d[key] 144 | # d = {**d, **d_} 145 | d.update(d_) 146 | 147 | 148 | def include_toggled_phrases(phrases): 149 | ''' 150 | Iterator for each category 151 | ''' 152 | for category in phrases: 153 | include_toggled_list(phrases[category]) 154 | 155 | 156 | def include_toggled_pairs(pairs): 157 | ''' 158 | Iterator for each categorical pair 159 | ''' 160 | total_pairs, c = sum([ len(pairs[category]) for category in pairs ]), 0 161 | for category in pairs: 162 | # 'If': ['so be'], 163 | # to be extended to 164 | # 'If': ['so be', 'So be'], 'if': ['so be', 'So be'] 165 | 166 | if category == 'arguing': 167 | # print('Passing', category) 168 | for key in pairs[category]: 169 | pairs[category][key] = list( set( pairs[category][key] ) ) 170 | pass 171 | # print('Not passing', category) 172 | for key in pairs[category]: 173 | # print ( pairs[category][key] ) 174 | include_toggled_list( pairs[category][key] ) 175 | c += 1 176 | # print(key, " => ", c, "/", total_pairs) 177 | 178 | c = 0 179 | for category in pairs: 180 | d2 = {} 181 | for key in pairs[category]: 182 | k2 = key[0].upper() if key[0].islower() else key[0].lower() 183 | d2[k2 + key[1:]] = pairs[category][key] 184 | c += 1 185 | # print(key, " => ", c, "/", total_pairs) 186 | pairs[category].update(d2) 187 | 188 | # make a set 189 | for category in pairs: 190 | for key in pairs[category]: 191 | pairs[category][key] = list( set( pairs[category][key] ) ) -------------------------------------------------------------------------------- /extractive/LetSum/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/LetSum/letsum.py: -------------------------------------------------------------------------------- 1 | from rouge import Rouge 2 | import os 3 | import html2text 4 | import nltk 5 | from collections import OrderedDict 6 | import operator 7 | import html2text 8 | import pycrfsuite 9 | import sys 10 | import time 11 | 12 | import letsum_test 13 | 14 | FULL_TEXT = '../../FullText_html/' 15 | MANUAL_SUM = '../../CaseAnalysis/' 16 | MAX_LENGTH_SUMMARY = 100 # Define maxmimum words in summary 17 | # MAX_PERCENT_SUMMARY = 34 # 10 # 10 in paper, 34 as discussed 18 | SUMMARY_PERCENT = 34 19 | REACHED_FILE = '' 20 | 21 | ## Max time taken so far: 3.87 hours 22 | ## 23 | ## Usage: python letsum.py 24 | ## Additionally: python letsum.py from_year to_year reached_file 25 | ## 26 | ## Goes upto, but not including to_year, skips files alphabetically smaller than reached_file 27 | ## 28 | 29 | 30 | # for 2010_B_31 31 | sys.setrecursionlimit(8735 * 2080 + 10) 32 | # reset to 1000 33 | 34 | def summary_letsum_looper(from_year=2010, to_year=2019, reached_file=''): 35 | ''' 36 | Loop over all files from 2010-2018, generate summaries and compute rouge scores 37 | ''' 38 | rouge = Rouge() 39 | scores = OrderedDict() 40 | summaries = {} 41 | REACHED_FILE = 
reached_file 42 | 43 | i = sum([ len( os.listdir( FULL_TEXT + str(year) ) ) for year in range(2010, from_year)]) + 1 44 | i0 = i-1 45 | # length is total number of documents 46 | length = sum([ len( os.listdir( FULL_TEXT + str(year) ) ) for year in range(2010, 2019)]) 47 | 48 | for year in range(from_year, to_year): 49 | l = sum([len( os.listdir( FULL_TEXT + str(y) ) ) for y in range(from_year, to_year)]) 50 | print('\n\n <=== ', year, ' / ', l, '===>\n\n') 51 | 52 | for case in sorted(os.listdir(FULL_TEXT + str(year))): 53 | print(i, ' / ', length, '=>', case, end='\r') 54 | if REACHED_FILE != '' and case <= REACHED_FILE: 55 | i += 1 56 | continue 57 | i2 = i - i0 58 | 59 | manual_summary = letsum_test.get_manual_summary(MANUAL_SUM + str(year) + '/' + case) 60 | if manual_summary == -1: 61 | print('+- Score ', i2, ' / ', l, ' ', i, ' / ', length, ' ', case, ' => Skipped') 62 | i += 1 63 | continue 64 | 65 | start = time.time() 66 | system_generated_summary = letsum_test.LetSum(FULL_TEXT + str(year) + '/' + case) 67 | running_time = time.time() - start 68 | print(system_generated_summary) 69 | print('Summary generated in ', running_time) 70 | 71 | # available are rouge-1, rouge-2, rouge-l 72 | # print(len(system_generated_summary.split(' '))) 73 | # print('\nManual\n', len(manual_summary.split(' '))) 74 | scores[case] = rouge.get_scores(system_generated_summary, manual_summary)[0] 75 | summaries[case] = system_generated_summary 76 | print('+- Score ', i2, ' / ', l, ' ', i, ' / ', length, ' ', case, ' => ', scores[case]) 77 | i += 1 78 | 79 | sys.setrecursionlimit(1000) # reset 80 | # print('\n\n-----------------------------------------') 81 | # print('Average score for ', len(scores), ' documents') 82 | # print('Fscore: ', sum([scores[case]['f'] for case in scores])) / len(scores) 83 | # print('Precision: ', sum([scores[case]['p'] for case in scores])) / len(scores) 84 | # print('Recall: ', sum([scores[case]['r'] for case in scores])) / len(scores) 85 | 86 | # Not doing top 10, instead handpicked cases 87 | # print('\n Top 10 documents by recall:') 88 | # metric = {case: scores[case]['r'] for case in scores} 89 | # sorted_r = sorted(metric.items(), key=operator.itemgetter(1), reverse=True) 90 | # # printing top 10 rouge scores 91 | # for i in range(10): 92 | # case = sorted_r[i][0] 93 | # print('# ',i+1, ': ', case, ' => ', sorted_r[i][1]) 94 | # print(summaries[case], '\n') 95 | 96 | 97 | if __name__ == '__main__': 98 | from_year, to_year, reached_file = 2010, 2019, '' 99 | if len(sys.argv) > 2: 100 | from_year, to_year = int(sys.argv[1]), int(sys.argv[2]) 101 | if len(sys.argv) > 3: 102 | reached_file = sys.argv[3] 103 | summary_letsum_looper(from_year, to_year, reached_file) -------------------------------------------------------------------------------- /extractive/MMR/MMR.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | import nltk 3 | from nltk.corpus import stopwords 4 | from nltk.tokenize import word_tokenize 5 | import spacy,os 6 | import argparse 7 | import re 8 | from tqdm import tqdm 9 | from collections import OrderedDict 10 | import string 11 | import numpy as np 12 | from spacy.lang.en import English 13 | import time 14 | nl = English() 15 | import sys 16 | import pandas as pd 17 | 18 | repeat = 5 19 | data = [] 20 | doc = [] 21 | l3 = [] 22 | summary = [] 23 | hypothesis = "" 24 | word_count = [] 25 | pair_similarity = [] 26 | summary_string = [] 27 | 28 | def 
count_word(index): 29 | global doc 30 | Doc = nl(doc[index]) 31 | tokens = [t.text for t in Doc] 32 | tokens = [t for t in tokens if len(t.translate(t.maketrans('', '', string.punctuation + string.whitespace))) > 0] # + string.digits 33 | return len(tokens) 34 | 35 | def store_word_count(): 36 | global word_count,doc 37 | word_count = [] 38 | for i in range(0,len(doc)): 39 | word_count.append(count_word(i)) 40 | 41 | def maximum(index, toPrint=0): 42 | global summary, pair_similarity 43 | length = len(summary) 44 | if(length!=0): 45 | max=0 46 | for i in range(length): 47 | a=pair_similarity[index][summary[i]] 48 | if(a>max): 49 | max=a 50 | if toPrint: 51 | print(str(summary[i])+" -> "+str(a)) 52 | return max 53 | else: 54 | return 0 55 | 56 | def count_sum(summary): 57 | sum=0 58 | length = len(summary) 59 | for i in range(length): 60 | sum+=count_word(summary[i]) 61 | return sum 62 | 63 | def mmr_sorted(lambda_, doc, length): 64 | global word_count, pair_similarity, summary 65 | #print('Inside MMR') 66 | print(length) 67 | l3 = [] 68 | vectorizer = TfidfVectorizer(smooth_idf=False) 69 | X = vectorizer.fit_transform(doc) 70 | y = X.toarray() 71 | rows = y.shape[0] 72 | cols = y.shape[1] 73 | pair_similarity = [] 74 | for i in range(rows): 75 | max=-1 76 | pair_similarity.append([]) 77 | for j in range(rows): 78 | if(j!=i): 79 | a = np.sum(np.multiply(y[i],y[j])) 80 | pair_similarity[-1].append(a) 81 | if(a>max): 82 | max=a 83 | else: 84 | pair_similarity[-1].append(1) 85 | l3.append(max) 86 | store_word_count() 87 | l = len(doc) 88 | count = 0 89 | last = -1 90 | summary = [] 91 | summary_word_count = 0 92 | while(1): 93 | if (summary_word_count < length): 94 | max=-1 95 | for i in range(l): 96 | a = maximum(i) 97 | mmrscore = lambda_*l3[i] - (1-lambda_)*a 98 | if(mmrscore >= max): 99 | max = mmrscore 100 | ind = i 101 | summary.append(ind) 102 | summary_word_count += word_count[ind] 103 | else: 104 | #print('Bye') 105 | break 106 | 107 | def listToString(): 108 | global summary_string, word_count, hypothesis, summary, doc 109 | summary_string = [] 110 | leng = 0 111 | for i in summary: 112 | if doc[i] not in summary_string: 113 | summary_string.append(doc[i]) 114 | leng += word_count[i] 115 | hypothesis = "".join(summary_string) 116 | 117 | 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument('--data_path', default = 'data/text/', type = str, help = 'Folder containing textual data') 120 | parser.add_argument('--summary_path', default = 'data/summaries/', type = str, help = 'Folder to store features of the textual data') 121 | parser.add_argument('--length_file', default = 'data/length.txt', type = str, help = 'Path to file containing summary length') 122 | 123 | 124 | args = parser.parse_args() 125 | 126 | print('Generating summary in ...'+args.summary_path) 127 | num_docs = len(os.listdir(args.data_path)) 128 | 129 | X1 = pd.read_csv(args.length_file, sep="\t", header=None) 130 | 131 | for i in tqdm(range(0,num_docs)): 132 | length1=X1[1][i] 133 | #length2=X2[1][i] 134 | doc = [] 135 | with open(os.path.join(args.data_path,X1[0][i]), 'r') as file: 136 | for x in file: 137 | if x != '\n': 138 | doc.append(x) 139 | lamda=0.6 140 | #for j in lamda: 141 | mmr_sorted(lamda,doc,length1) 142 | listToString() 143 | f= open(os.path.join(args.summary_path,X1[0][i]),"w+") 144 | n = f.write(hypothesis) 145 | f.close() 146 | hypothesis="" 147 | -------------------------------------------------------------------------------- /extractive/MMR/README.md: 
-------------------------------------------------------------------------------- 1 | # MMR 2 | 3 | Implementation of the paper [Automatic Summarization of Legal Decisions using Iterative Masking of Predictive Sentences](https://dl.acm.org/doi/10.1145/3322640.3326728), ICAIL 2019 4 | 5 | 6 | ### Basic command to run: 7 | `python MMR.py --data_path /path/to/documents/ --summary_path /path/to/save/summaries/ --length_file lengthfile.txt` 8 | 9 | 10 | ### Format of the length file 11 | 12 | ``` 13 | filename required-summary-length-in-words 14 | ``` 15 | 16 | ### External libraries required 17 | 18 | - spacy = 2.2.3 19 | - nltk = 3.5 20 | - sklearn = 0.21.3 21 | - tqdm 22 | -------------------------------------------------------------------------------- /extractive/MMR/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/RBM/README.md: -------------------------------------------------------------------------------- 1 | # TextSummarizer 2 | Implementation of the paper [Extractive Summarization using Deep Learning](https://arxiv.org/pdf/1708.04439.pdf) 3 | 4 | Description: Uses two layers of Restricted Boltzmann Machines as a Deep Belief Network to enhance and abstract various features such as named entities, proper nouns, numeric tokens and sentence position, scores each sentence on these features, and then selects the top-scoring sentences to produce an extractive summary. 5 | 6 | ## Prerequisites 7 | * [Python 2.7+](https://www.python.org/download/releases/2.7/) 8 | * [Numpy](http://www.numpy.org/) 9 | * [Scipy](https://www.scipy.org/) 10 | * [Theano](https://github.com/Theano/Theano) 11 | * [NLTK](http://www.nltk.org/) 12 | 13 | ## Getting Started 14 | 1. Clone this repository
15 | 2. Run `python Summarizer.py` to summarize the articles. The first argument will be the directory containing the input files and the second argument will be the directory where the output will be stored
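   For example, using illustrative directory names (the paths are not fixed by the code): `python Summarizer.py ./input_articles/ ./output_summaries/`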
16 | -------------------------------------------------------------------------------- /extractive/RBM/enhancedfeatureSum: -------------------------------------------------------------------------------- 1 | -4.14446496262 2 | -5.22418720821 3 | -2.08044325936 4 | 3.51808975667 5 | -9.1456816337 6 | 2.28970787758 7 | -1.31615384616 8 | -5.99801672717 9 | 5.89963260743 10 | -6.83383889168 11 | 1.37234532214 12 | 1.13257516088 13 | -8.99964700893 14 | 6.37152248479 15 | -4.7296241192 16 | -3.30522300491 17 | 4.0947679278 18 | -8.66964930836 19 | 0.865994550113 20 | 0.0395620901692 21 | -------------------------------------------------------------------------------- /extractive/RBM/entity2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | sample = "Narenda Modi is the pm of India" 4 | 5 | def extract_entity_names(t): 6 | entity_names = [] 7 | 8 | if hasattr(t, 'label') and t.label: 9 | if t.label() == 'NE': 10 | entity_names.append(' '.join([child[0] for child in t])) 11 | else: 12 | for child in t: 13 | entity_names.extend(extract_entity_names(child)) 14 | 15 | return entity_names 16 | def ner(sample): 17 | try: 18 | sentences = nltk.sent_tokenize(sample) 19 | except: 20 | raise Exception 21 | tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 22 | tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] 23 | chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True) 24 | 25 | 26 | 27 | entity_names = [] 28 | for tree in chunked_sentences: 29 | # Print results per sentence 30 | # print extract_entity_names(tree) 31 | 32 | entity_names.extend(extract_entity_names(tree)) 33 | print set(entity_names) 34 | return len(entity_names) -------------------------------------------------------------------------------- /extractive/RBM/extractiveDemo.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import os 3 | 4 | def readPaths(path): 5 | # Create array of array of images. 
6 | print(path) 7 | imagePaths = [] 8 | filePaths = [] 9 | # List all files in the directory and read points from text files one by one 10 | for filePath in sorted(os.listdir(path)): 11 | # print("10 ", filePath) 12 | fileExt = os.path.splitext(filePath)[1] 13 | if fileExt in [".html"]: 14 | # print (filePath) 15 | filePaths.append(filePath) 16 | 17 | # Add to array of images 18 | imagePaths.append(os.path.join(path, filePath)) 19 | 20 | return imagePaths, filePaths 21 | 22 | print('Enter directory for text files') 23 | dirname2 = input() 24 | for dirs in os.listdir(dirname2): 25 | if dirs in ['2013', '2014']: 26 | # print(type(dirs)) 27 | if os.path.isdir(os.path.join(dirname2,dirs)): 28 | summaryFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'CaseAnalysis')) 29 | # print("hello") 30 | textFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'FullText')) 31 | # print(summaryFiles) 32 | # print(textFiles) 33 | for count in range(0,min(len(summaryFiles),len(textFiles))): 34 | HtmlFile = open(textFiles[count], 'r', encoding='utf-8') 35 | source_code = HtmlFile.read() 36 | soup = BeautifulSoup(source_code, 'html.parser') 37 | allDiv = soup.find_all('div', class_='locatorSection') 38 | #f = open("Summary_7.txt","wb") 39 | text = str() 40 | for i in allDiv: 41 | #print(i.find_all('h3')) 42 | flag = 0 43 | for h3tag in i.find_all('h3'): 44 | if h3tag.get('id') == 'casedigest': 45 | flag = 1 46 | break 47 | if flag == 1: 48 | for p in i.find_all('p'): 49 | if p.find('strong') != None: 50 | if p.find('strong').text == 'Summary:': 51 | for tag in p.find_all('strong'): 52 | tag.replaceWith('') 53 | # print(p.text) 54 | text = p.text 55 | #f.write(bytes(text,'UTF-8')) 56 | 57 | #f.close() 58 | if any(os.path.basename(textFiles[count]) in os.path.basename(s) for s in textFiles): 59 | HtmlFile = open(textFiles[count], 'r', encoding='utf-8') 60 | source_code = HtmlFile.read() 61 | soup = BeautifulSoup(source_code, 'html.parser') 62 | allDiv = soup.find_all('div', class_='docContent') 63 | 64 | f = open("Extractive/demo_test/"+dirs+"_"+names[count]+".txt","w") 65 | print("64 ", dirs+"_"+names[count]) 66 | for i in allDiv: 67 | footNotes = i.find_all('p', class_='small center') 68 | for tag in footNotes: 69 | tag.replaceWith('') 70 | # print(i.text) 71 | # f.write(i.text) 72 | # f.write("\n\n@highlight\n") 73 | f.write(text) 74 | 75 | f.close() 76 | -------------------------------------------------------------------------------- /extractive/RBM/featureSum: -------------------------------------------------------------------------------- 1 | 4.51799738281 2 | 1.45031689174 3 | 1.65310050155 4 | 0.257102174852 5 | 3.07559906141 6 | 2.37254781826 7 | 0.679321453233 8 | 1.58868062067 9 | 0.0396091840985 10 | 1.5214609106 11 | 0.472780015949 12 | 0.590904641577 13 | 3.63993420881 14 | -0.525165960839 15 | 1.52273977015 16 | 1.21188091948 17 | 0.402717131627 18 | 2.26820518638 19 | 2.50662999217 20 | -0.00553588031303 21 | -------------------------------------------------------------------------------- /extractive/RBM/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/RBM/ner.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | 4 | # tokenize doc 5 | with open('Texts/Text_1.txt','r') as f: 6 | doc = f.read() 7 | 8 | sent_text = nltk.sent_tokenize(doc) 9 | 10 | op = "" 11 | 12 | for sent 
in sent_text: 13 | tokenized_text = nltk.word_tokenize(sent) 14 | num_present = [(x in summ_word_tokens) for x in tokenized_text] 15 | num_present = sum(num_present * 1) 16 | if num_present / len(tokenized_text) < 0.25: 17 | score = 0 18 | elif num_present / len(tokenized_text) > 0.75: 19 | score = 2 20 | else: 21 | score = 1 22 | op += sent + ' \t\t' + str(score) + '\n' 23 | 24 | print(op) 25 | 26 | 27 | tokenized_doc = nltk.word_tokenize(doc) 28 | # tag sentences and use nltk's Named Entity Chunker 29 | tagged_sentences = nltk.pos_tag(tokenized_doc) 30 | ne_chunked_sents = nltk.ne_chunk(tagged_sentences) 31 | 32 | # extract all named entities 33 | 34 | named_entities = [] 35 | count = 1 36 | entity_list="" 37 | 38 | 39 | for tagged_tree in ne_chunked_sents: 40 | if hasattr(tagged_tree, 'label'): 41 | entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) # 42 | entity_type = tagged_tree.label() # get NE category 43 | named_entities.append((entity_name, entity_type)) 44 | 45 | named_entities = list(set(named_entities)) 46 | # print(named_entities) 47 | for entities in named_entities: 48 | rep_string = "@entity"+str(count) 49 | doc = doc.replace(entities[0], rep_string) 50 | entity_list+="@entity"+str(count)+":"+entities[0]+"\n" 51 | count+=1 52 | 53 | sentences = nltk.sent_tokenize(doc) 54 | 55 | # doc+="\n"+entity_list 56 | print(doc) -------------------------------------------------------------------------------- /extractive/RBM/para_reader.py: -------------------------------------------------------------------------------- 1 | 2 | import readline 3 | 4 | 5 | class Paragraphs: 6 | 7 | def __init__(self, fileobj, separator='\n'): 8 | 9 | # Ensure that we get a line-reading sequence in the best way possible: 10 | #import xreadlines 11 | try: 12 | # Check if the file-like object has an xreadlines method 13 | self.seq = fileobj.readlines() 14 | except AttributeError: 15 | # No, so fall back to the xreadlines module's implementation 16 | self.seq = xreadlines.readlines(fileobj) 17 | 18 | self.line_num = 0 # current index into self.seq (line number) 19 | self.para_num = 0 # current index into self (paragraph number) 20 | 21 | # Ensure that separator string includes a line-end character at the end 22 | if separator[-1:] != '\n': separator += '\n' 23 | self.separator = separator 24 | 25 | 26 | def __getitem__(self, index): 27 | if index != self.para_num: 28 | raise TypeError, "Only sequential access supported" 29 | 30 | self.para_num += 1 31 | # Start where we left off and skip 0+ separator lines 32 | while 1: 33 | # Propagate IndexError, if any, since we're finished if it occurs 34 | line = self.seq[self.line_num] 35 | #print "line : ",line 36 | self.line_num += 1 37 | if line != self.separator: break 38 | # Accumulate 1+ nonempty lines into result 39 | result = [line] 40 | while 1: 41 | # Intercept IndexError, since we have one last paragraph to return 42 | try: 43 | # Let's check if there's at least one more line in self.seq 44 | line = self.seq[self.line_num] 45 | #print "line 2 : ",line 46 | except IndexError: 47 | # self.seq is finished, so we exit the loop 48 | break 49 | # Increment index into self.seq for next time 50 | self.line_num += 1 51 | result.append(line) 52 | if line == self.separator: break 53 | 54 | return ''.join(result) 55 | 56 | # Here's an example function, showing how to use class Paragraphs: 57 | def show_paragraphs(filename, numpars=20): 58 | paralist = [] 59 | pp = Paragraphs(open(filename)) 60 | for p in pp: 61 | #print "Par#%d : %s" % (pp.para_num, repr(p)) 62 
| paralist.append(p) 63 | if pp.para_num>numpars: break 64 | 65 | return paralist 66 | #print(show_paragraphs('article2')) 67 | 68 | -------------------------------------------------------------------------------- /extractive/RBM/parseForExtract.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import os 3 | 4 | def readPaths(path): 5 | # Create array of array of images. 6 | print(path) 7 | imagePaths = [] 8 | filePaths = [] 9 | # List all files in the directory and read points from text files one by one 10 | for filePath in sorted(os.listdir(path)): 11 | # print("10 ", filePath) 12 | fileExt = os.path.splitext(filePath)[1] 13 | if fileExt in [".html"]: 14 | # print (filePath) 15 | filePaths.append(filePath) 16 | 17 | # Add to array of images 18 | imagePaths.append(os.path.join(path, filePath)) 19 | 20 | return imagePaths, filePaths 21 | 22 | print('Enter directory for text files') 23 | dirname2 = input() 24 | for dirs in os.listdir(dirname2): 25 | # print(type(dirs)) 26 | if os.path.isdir(os.path.join(dirname2,dirs)): 27 | summaryFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'CaseAnalysis')) 28 | # print("hello") 29 | textFiles, names = readPaths(os.path.join(os.path.join(dirname2,dirs),'FullText')) 30 | # print(summaryFiles) 31 | # print(textFiles) 32 | 33 | for count in range(0,min(len(summaryFiles),len(textFiles))): 34 | HtmlFile = open(summaryFiles[count], 'r', encoding='utf-8') 35 | source_code = HtmlFile.read() 36 | soup = BeautifulSoup(source_code, 'html.parser') 37 | allDiv = soup.find_all('div', class_='locatorSection') 38 | #f = open("Summary_7.txt","wb") 39 | text = str() 40 | for i in allDiv: 41 | #print(i.find_all('h3')) 42 | flag = 0 43 | for h3tag in i.find_all('h3'): 44 | if h3tag.get('id') == 'casedigest': 45 | flag = 1 46 | break 47 | if flag == 1: 48 | for p in i.find_all('p'): 49 | if p.find('strong') != None: 50 | if p.find('strong').text == 'Summary:': 51 | for tag in p.find_all('strong'): 52 | tag.replaceWith('') 53 | # print(p.text) 54 | text = p.text 55 | #f.write(bytes(text,'UTF-8')) 56 | 57 | #f.close() 58 | if any(os.path.basename(summaryFiles[count]) in os.path.basename(s) for s in textFiles): 59 | HtmlFile = open(textFiles[count], 'r', encoding='utf-8') 60 | source_code = HtmlFile.read() 61 | soup = BeautifulSoup(source_code, 'html.parser') 62 | allDiv = soup.find_all('div', class_='docContent') 63 | 64 | f = open("NeuralSum/"+dirs+"_"+names[count]+".txt","w") 65 | print("64 ", dirs+"_"+names[count]) 66 | for i in allDiv: 67 | footNotes = i.find_all('p', class_='small center') 68 | for tag in footNotes: 69 | tag.replaceWith('') 70 | # print(i.text) 71 | f.write(i.text) 72 | # f.write("\n\n@highlight\n") 73 | #print('************') 74 | #print(text) 75 | #print('************') 76 | #print(i.text) 77 | #print('************') 78 | # f.write(text) 79 | 80 | f.close() 81 | -------------------------------------------------------------------------------- /extractive/RBM/plot.py: -------------------------------------------------------------------------------- 1 | import plotly.plotly as py 2 | import plotly.graph_objs as go 3 | 4 | trace1 = go.Scatter( 5 | x=[1, 2, 3, 4, 5, 6 | 6, 7, 8, 9, 10, 7 | 11, 12, 13, 14, 15], 8 | y=[10, 20, None, 15, 10, 9 | 5, 15, None, 20, 10, 10 | 10, 15, 25, 20, 10], 11 | name = 'No Gaps', # Style name/legend entry with html tags 12 | connectgaps=True 13 | ) 14 | trace2 = go.Scatter( 15 | x=[1, 2, 3, 4, 5, 16 | 6, 7, 8, 9, 10, 17 | 11, 12, 13, 
14, 15], 18 | y=[5, 15, None, 10, 5, 19 | 0, 10, None, 15, 5, 20 | 5, 10, 20, 15, 5], 21 | name = 'Gaps', 22 | ) 23 | 24 | data = [trace1, trace2] 25 | 26 | fig = dict(data=data) 27 | py.iplot(fig, filename='simple-connectgaps') 28 | -------------------------------------------------------------------------------- /extractive/RBM/plotLines.py: -------------------------------------------------------------------------------- 1 | import plotly.plotly as py 2 | import plotly.graph_objs as go 3 | 4 | # trace1 = go.Scatter( 5 | # x=[1, 2, 3, 4, 5, 6 | # 6, 7, 8, 9, 10, 7 | # 11, 12, 13, 14, 15], 8 | # y=[10, 20, None, 15, 10, 9 | # 5, 15, None, 20, 10, 10 | # 10, 15, 25, 20, 10], 11 | # name = 'No Gaps', # Style name/legend entry with html tags 12 | # connectgaps=True 13 | # ) 14 | trace1 = go.Scatter( 15 | y=[0.777777777778,0.8,0.866666666667,0.833333333333,0.791666666667,0.833333333333,0.861111111111,0.836363636364], 16 | x=[16,20,30,35,47,61,71,109], 17 | name = 'No Gaps', # Style name/legend entry with html tags 18 | connectgaps=True 19 | ) 20 | 21 | 22 | # trace2 = go.Scatter( 23 | # y=[0.823, 0.677,0.75,0.79], 24 | # x=[25,30,40,45], 25 | # name = 'Gaps' 26 | # ) 27 | 28 | 29 | # trace2 = go.Scatter( 30 | # x=[1, 2, 3, 4, 5, 31 | # 6, 7, 8, 9, 10, 32 | # 11, 12, 13, 14, 15], 33 | # y=[5, 15, None, 10, 5, 34 | # 0, 10, None, 15, 5, 35 | # 5, 10, 20, 15, 5], 36 | # name = 'Gaps', 37 | # ) 38 | 39 | data = [trace1] 40 | 41 | fig = dict(data=data) 42 | py.iplot(fig, filename='precision') 43 | -------------------------------------------------------------------------------- /extractive/RBM/saveCre.py: -------------------------------------------------------------------------------- 1 | import plotly 2 | plotly.tools.set_credentials_file(username='vagisha', api_key='Qcl3DGmQ8oWR81iAHOlV') -------------------------------------------------------------------------------- /extractive/RBM/sentence_labeller.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import glob 3 | import tqdm 4 | import pdb 5 | 6 | for file in tqdm.tqdm(glob.glob('./Test_Again/*.txt')): 7 | with open(file, 'r') as f: 8 | text = f.read() 9 | f.close() 10 | with open('./Extractive/ref_summ/' + file[13:], 'r') as f: 11 | summ_sent = f.read() 12 | f.close() 13 | 14 | sent_text = nltk.sent_tokenize(text) 15 | 16 | summ_word_tokens = nltk.word_tokenize(summ_sent) 17 | 18 | op = '' 19 | 20 | for sent in sent_text: 21 | tokenized_text = nltk.word_tokenize(sent) 22 | num_present = [x in summ_word_tokens for x in tokenized_text] 23 | num_present = sum(num_present * 1) 24 | # pdb.set_trace() 25 | if num_present / len(tokenized_text) < 0.25: 26 | score = 0 27 | elif num_present / len(tokenized_text) > 0.75: 28 | score = 2 29 | else: 30 | score = 1 31 | op += sent + ' \t\t' + str(score) + '\n' 32 | 33 | tokenized_doc = nltk.word_tokenize(op) 34 | # tag sentences and use nltk's Named Entity Chunker 35 | tagged_sentences = nltk.pos_tag(tokenized_doc) 36 | ne_chunked_sents = nltk.ne_chunk(tagged_sentences) 37 | 38 | # extract all named entities 39 | 40 | named_entities = [] 41 | count = 1 42 | entity_list="" 43 | 44 | 45 | for tagged_tree in ne_chunked_sents: 46 | if hasattr(tagged_tree, 'label'): 47 | entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) # 48 | entity_type = tagged_tree.label() # get NE category 49 | named_entities.append((entity_name, entity_type)) 50 | 51 | named_entities = list(set(named_entities)) 52 | # print(named_entities) 53 | for entities in named_entities: 54 
| rep_string = "@entity"+str(count) 55 | op = op.replace(entities[0], rep_string) 56 | summ_sent = summ_sent.replace(entities[0], rep_string) 57 | entity_list+="@entity"+str(count)+":"+entities[0]+"\n" 58 | count+=1 59 | 60 | # print(op) 61 | # break 62 | op+="\n\n"+summ_sent 63 | op+="\n\n"+entity_list 64 | with open('./Extractive/test_annotated_again/' + file[13:], 'w') as f: 65 | f.write(op) 66 | f.close() 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /extractive/abs_to_ext/README.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | This code converts an abstractive gold standard summary to extractive. Given the full document and its abstractive gold standard summary, the code labels each sentence in the full document either 1 (meaning that it is a summary sentence) or 0 (meaning that it is not a summary sentence). 4 | 5 | The extractive training data thus generated can be used to train extractive supervised summarization methods like [Gist](https://github.com/Law-AI/summarization/tree/aacl/extractive/Gist). 6 | 7 | ### Basic command to run: 8 | `python extractive_labels.py /path/to/base/directory` 9 | 10 | ### To view other options (show help): 11 | `python extractive_labels.py -h` 12 | 13 | ### External libraries required: 14 | - numpy 15 | - tqdm 16 | - [rouge](https://pypi.org/project/rouge/) 17 | 18 | 19 | ### Method reference 20 | You can use one of the two methods for generating labels: 21 | - **avr_rg**: For each sentence in the summary, this method selects the 3 sentences from the full text with the highest average ROUGE scores. Ref: [Narayan et al., NAACL 2018](https://www.aclweb.org/anthology/N18-1158/) 22 | - **m_rg**: This method greedily selects sentences from the full text so as to maximize the ROUGE score with respect to the summary. Ref: [Nallapati et al., AAAI 2017](https://ojs.aaai.org/index.php/AAAI/article/view/10958) 23 | -------------------------------------------------------------------------------- /extractive/abs_to_ext/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/annotated/SAKHKKAR_MILLS_MAZDOOR_SANGH-annotated.txt: -------------------------------------------------------------------------------- 1 | 2 | SAKHKKAR MILLS MAZDOOR SANGH V. GWALIOR SUGAR CO. LTD. [1985] INSC 32; AIR 1985 SC 758; 1985 (2) SCR 958; 1985 (2) SCC 134; 1985 (1) SCALE 301 (22 February 1985) 3 | 4 | 5 | 6 | 7 | S1 FI The first respondent is a company engaged in the manufacture of sugar and employing over 1100 workers, 300 of them on a permanent basis and 800 on a seasonal basis. 8 | 9 | S2 FI The permanent employees are those employed on the clerical side and in the operation and maintenance of machines. 10 | 11 | S3 FI The other 800 employees are seasonal employees who are so employed because the factory itself does not work through out the year but works during a certain season every year from December when the sugarcane crop is ready for crushing until the crushing is over. 12 | 13 | S4 FI The employer refused to pay bonus to the seasonal employees during the year 1964-65 on the ground that they were not employed through out the year.
14 | 15 | S5 FI A dispute arose between the management and the Mazdoor Sangh which was referred to the Industrial Court Madhya Pradesh, Indore for arbitration under sec.49 of the Madhya Pradesh Industrial Relations Act. 16 | 17 | S6 LR The Industrial Court decided in favour of the workers, but on a writ petition filed by the company, the award of the arbitrator was quashed and it was held that the workers were only entitled to proportionate bonus and not the minimum bonus 960 guaranteed by sec. 10 of the Payment of Bonus Act, 1965. 18 | 19 | S7 I This appeal has been filed by the Mazdoor Sangh under a certificate granted by the High Court of Madhya Pradesh. 20 | 21 | S8 I To our minds the question is a simple one and is capable of only one answer. 22 | 23 | S9 SS Sections 10 and 13 of the Payment of Bonus Act, as they stood at the relevant time, were as follows: Section 10. 24 | 25 | S10 SS PAYMENT OF MINIMUM BONUS- Subject to the provisions of ss 8 and 13 every employer shall be bound to pay to every employee in an accounting year a minimum bonus which shall be four per cent of the salary or wage earned by the employee during the accounting year or forty rupees, whichever is higher, whether there are profits in the accounting year or not: 26 | Provided that where such employee has not completed fifteen years of age at the beginning of the accounting year, the provisions of this section shall have effect in relation to such employee as if for the words 'forty rupees', the words 'twenty five rupees" were substituted. 27 | 28 | S11 SS Section13-PROPORTIONATE REDUCTION OF BONUS IN CERTAIN CASES-Where an employee has not worked for all the working days in any accounting year, the minimum bonus of forty rupees or, as the case may be, of twenty five rupees, per cent, of his salary or wage for the days he has worked in that accounting year shall be proportionately reduced. 29 | 30 | 31 | S12 LR The High Court has interpreted the words "working days in any accounting year" as meaning all those days of the year except holidays. 32 | 33 | S13 R While such an interpretation may be alright in the case of a factory which works all through the year, it would be hardly appropriate in the case of a factory which works during a particular season every year. 34 | 35 | S14 R In the case of a factory which works seasonally during an accounting year, ' working days in any accounting year" can only mean those days of the year during which the employee concerned is actually allowed to work. 36 | 37 | S15 LR That was the interpretation which was placed upon the expression by the Industrial Court 38 | 39 | S16 R and we think it is the proper interpretation. 40 | 41 | S17 R Having regard to the scheme and the purpose of the Act, we do not think that the High Court was justified in placing a different construction on the meaning of the expression "working days in any accounting year. 42 | 43 | S18 R We, therefore, set aside the judgment of the High Court and restore the award of the Industrial Court. 44 | 45 | S19 R The bonus payable to the employees will carry interest at the rate of nine per cent per annum from the day when the bonus became due until the date of payment. 46 | 47 | S20 R The appeal is allowed with costs. 48 | 49 | S21 R N.V.K Appeal allowed. 
50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /extractive/annotated/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /extractive/top_ngrams/gitkeep: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------