├── .ipynb_checkpoints ├── Summ_Gensim-checkpoint.ipynb ├── Summ_Pytease-checkpoint.ipynb ├── Summ_Sumy_lex-checkpoint.ipynb ├── Summ_Sumy_lsa-checkpoint.ipynb ├── Summ_Sumy_luhn-checkpoint.ipynb └── bleu_model-checkpoint.ipynb ├── README.md ├── Summ_Gensim.ipynb ├── Summ_Pytease.ipynb ├── Summ_Sumy_lex.ipynb ├── Summ_Sumy_lsa.ipynb ├── Summ_Sumy_luhn.ipynb ├── bleu_model.ipynb └── pyrougetextrank.py /.ipynb_checkpoints/Summ_Gensim-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from gensim.summarization import summarize" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "def summ_it():\n", 24 | " total = 0\n", 25 | " for file in os.listdir('topics'):\n", 26 | " with open('topics/' + file, 'r') as f:\n", 27 | " summ = summarize(unicode(f.read().replace('\\n',' '), errors='ignore'), word_count=75)\n", 28 | " with open('summ_gensim/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 29 | " fw.writelines(summ)\n", 30 | " total += len(summ)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "1 loop, best of 3: 57.7 s per loop\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "summ_it()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "anaconda-cloud": {}, 64 | "kernelspec": { 65 | "display_name": "Python [default]", 66 | "language": "python", 67 | "name": "python2" 68 | }, 69 | "language_info": { 70 
| "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 2 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython2", 79 | "version": "2.7.12" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 0 84 | } 85 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Summ_Pytease-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from pyteaser import Summarize" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "for file in os.listdir('topics'):\n", 24 | " with open('topics/' + file, 'r') as f:\n", 25 | " #print file\n", 26 | " summ = Summarize('',unicode(f.read().replace('\\n',' '), errors='ignore'))\n", 27 | " with open('summ_pyteaser/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 28 | " fw.writelines(summ)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [] 39 | } 40 | ], 41 | "metadata": { 42 | "anaconda-cloud": {}, 43 | "kernelspec": { 44 | "display_name": "Python [default]", 45 | "language": "python", 46 | "name": "python2" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 2 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython2", 58 | "version": "2.7.12" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 0 63 | } 64 | 
-------------------------------------------------------------------------------- /.ipynb_checkpoints/Summ_Sumy_lex-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from sumy.summarizers.lex_rank import LexRankSummarizer\n", 13 | "from sumy.parsers.plaintext import PlaintextParser\n", 14 | "from sumy.nlp.tokenizers import Tokenizer\n", 15 | "from sumy.nlp.stemmers import Stemmer\n", 16 | "from sumy.utils import get_stop_words" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 6, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "LANGUAGE = \"english\"\n", 28 | "SENTENCES_COUNT = 5" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 7, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def summ_it():\n", 40 | " length = 0\n", 41 | " total = 0\n", 42 | " for file in os.listdir('topics'):\n", 43 | " with open('topics/' + file, 'r') as f:\n", 44 | " parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n", 45 | " stemmer = Stemmer(LANGUAGE)\n", 46 | " summarizer = LexRankSummarizer(stemmer)\n", 47 | " summarizer.stop_words = get_stop_words(LANGUAGE)\n", 48 | " with open('summ_sumy_lex/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 49 | " for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 50 | " #print sentence\n", 51 | " fw.writelines(str(sentence))\n", 52 | " length += len(str(sentence))\n", 53 | " total += length\n", 54 | " length = 0" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 8, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 
67 | "text": [ 68 | "1 loop, best of 3: 41.2 s per loop\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "summ_it()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "anaconda-cloud": {}, 88 | "kernelspec": { 89 | "display_name": "Python [default]", 90 | "language": "python", 91 | "name": "python2" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 2 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython2", 103 | "version": "2.7.12" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 0 108 | } 109 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Summ_Sumy_lsa-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from sumy.summarizers import luhn\n", 13 | "from sumy.parsers.plaintext import PlaintextParser\n", 14 | "from sumy.nlp.tokenizers import Tokenizer\n", 15 | "from sumy.nlp.stemmers import Stemmer\n", 16 | "from sumy.utils import get_stop_words\n", 17 | "from sumy.summarizers.lsa import LsaSummarizer as Summarizer" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "LANGUAGE = \"english\"\n", 29 | "SENTENCES_COUNT = 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "for file in os.listdir('topics'):\n", 41 | " with 
open('topics/' + file, 'r') as f:\n", 42 | " parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n", 43 | " stemmer = Stemmer(LANGUAGE)\n", 44 | " summarizer = Summarizer(stemmer)\n", 45 | " summarizer.stop_words = get_stop_words(LANGUAGE)\n", 46 | " with open('summ_sumy_lsa/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 47 | " for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 48 | " #print sentence\n", 49 | " fw.writelines(str(sentence))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "anaconda-cloud": {}, 64 | "kernelspec": { 65 | "display_name": "Python [default]", 66 | "language": "python", 67 | "name": "python2" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 2 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython2", 79 | "version": "2.7.12" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 0 84 | } 85 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Summ_Sumy_luhn-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from sumy.summarizers.luhn import LuhnSummarizer \n", 13 | "from sumy.parsers.plaintext import PlaintextParser\n", 14 | "from sumy.nlp.tokenizers import Tokenizer\n", 15 | "from sumy.nlp.stemmers import Stemmer\n", 16 | "from sumy.utils import get_stop_words" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 5, 22 | "metadata": { 23 | 
"collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "LANGUAGE = \"english\"\n", 28 | "SENTENCES_COUNT = 1" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 6, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "for file in os.listdir('topics'):\n", 40 | " with open('topics/' + file, 'r') as f:\n", 41 | " parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n", 42 | " stemmer = Stemmer(LANGUAGE)\n", 43 | " summarizer = LuhnSummarizer(stemmer)\n", 44 | " summarizer.stop_words = get_stop_words(LANGUAGE)\n", 45 | " with open('summ_sumy_lex/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 46 | " for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 47 | " #print sentence\n", 48 | " fw.writelines(str(sentence))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "While summarizing a document, summarizer invokes __call__() method which takes two arguments, a document and Sentence_Count. The second argument doesn't seem to be effective while summarizing the document. Even with SENTENCE_COUNT = 1 summaries generated have more than 1 sentence." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "anaconda-cloud": {}, 70 | "kernelspec": { 71 | "display_name": "Python [default]", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 0 90 | } 91 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/bleu_model-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import nltk\n", 12 | "import os\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "path2model = 'path to model summary'\n", 25 | "path2gold = 'path to reference summary'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "files = os.listdir(path2model)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "bleu_scores = []\n", 48 | "for i in files:\n", 49 | " reference = []\n", 50 | " gold_summ = os.listdir(path2gold+'/' + i.split('.txt')[0])\n", 51 | " with open(path2model + '/' + i,'r') as fmodel:\n", 52 | " textmodel 
= fmodel.read()\n", 53 | " fmodel.close()\n", 54 | " for j in gold_summ:\n", 55 | " with open(path2gold + '/' + i.split('.txt')[0] + '/' + j,'r') as fgold: \n", 56 | " textgold = fgold.read()\n", 57 | " fmodel.close()\n", 58 | " reference.append(textgold)\n", 59 | " \n", 60 | " \n", 61 | " bleu_scores.append(nltk.translate.bleu_score.sentence_bleu(reference, textmodel, weights=[0.4,0.3,0.2]))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "np.mean(bleu_scores)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "np.std(bleu_scores)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "anaconda-cloud": {}, 98 | "kernelspec": { 99 | "display_name": "Python [conda root]", 100 | "language": "python", 101 | "name": "conda-root-py" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 2 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython2", 113 | "version": "2.7.12" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 1 118 | } 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scripts-for-extractive-summarization 2 | Scripts for an upcoming blog "Extractive vs. Abstractive Summarization" for RaRe Technologies. 3 | 4 | Requirements: 5 | 1. Gensim 6 | 2. Pyteaser 7 | 3. Sumy 8 | 4. NLTK 9 | 5. 
pythonrouge by tagucci 10 | 11 | The following extractive summarization algorithms have been used: 12 | 1. Gensim's TextRank 13 | 2. Sumy's implementation of Luhn's algorithm 14 | 3. Sumy's LexRank implementation 15 | 4. Sumy's LSA implementation 16 | 5. Pyteaser: Python implementation of TextTeaser. 17 | 18 | Link to the blog: https://t.co/YidmkzqRVW 19 | -------------------------------------------------------------------------------- /Summ_Gensim.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from gensim.summarization import summarize" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "def summ_it():\n", 24 | " total = 0\n", 25 | " for file in os.listdir('topics'):\n", 26 | " with open('topics/' + file, 'r') as f:\n", 27 | " summ = summarize(unicode(f.read().replace('\\n',' '), errors='ignore'), word_count=75)\n", 28 | " with open('summ_gensim/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 29 | " fw.writelines(summ)\n", 30 | " total += len(summ)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "1 loop, best of 3: 57.7 s per loop\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "summ_it()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "anaconda-cloud": {}, 64 | "kernelspec": { 65 | "display_name": "Python [default]", 66 | "language": "python", 67 | "name": "python2" 68 | }, 69 | "language_info": 
{ 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 2 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython2", 79 | "version": "2.7.12" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 0 84 | } 85 | -------------------------------------------------------------------------------- /Summ_Pytease.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from pyteaser import Summarize" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "for file in os.listdir('topics'):\n", 24 | " with open('topics/' + file, 'r') as f:\n", 25 | " #print file\n", 26 | " summ = Summarize('',unicode(f.read().replace('\\n',' '), errors='ignore'))\n", 27 | " with open('summ_pyteaser/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 28 | " fw.writelines(summ)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [] 39 | } 40 | ], 41 | "metadata": { 42 | "anaconda-cloud": {}, 43 | "kernelspec": { 44 | "display_name": "Python [default]", 45 | "language": "python", 46 | "name": "python2" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 2 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython2", 58 | "version": "2.7.12" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 0 63 | } 64 | 
-------------------------------------------------------------------------------- /Summ_Sumy_lex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from sumy.summarizers.lex_rank import LexRankSummarizer\n", 13 | "from sumy.parsers.plaintext import PlaintextParser\n", 14 | "from sumy.nlp.tokenizers import Tokenizer\n", 15 | "from sumy.nlp.stemmers import Stemmer\n", 16 | "from sumy.utils import get_stop_words" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 6, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "LANGUAGE = \"english\"\n", 28 | "SENTENCES_COUNT = 5" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 7, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def summ_it():\n", 40 | " length = 0\n", 41 | " total = 0\n", 42 | " for file in os.listdir('topics'):\n", 43 | " with open('topics/' + file, 'r') as f:\n", 44 | " parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n", 45 | " stemmer = Stemmer(LANGUAGE)\n", 46 | " summarizer = LexRankSummarizer(stemmer)\n", 47 | " summarizer.stop_words = get_stop_words(LANGUAGE)\n", 48 | " with open('summ_sumy_lex/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 49 | " for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 50 | " #print sentence\n", 51 | " fw.writelines(str(sentence))\n", 52 | " length += len(str(sentence))\n", 53 | " total += length\n", 54 | " length = 0" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 8, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "1 loop, 
best of 3: 41.2 s per loop\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "summ_it()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "anaconda-cloud": {}, 88 | "kernelspec": { 89 | "display_name": "Python [default]", 90 | "language": "python", 91 | "name": "python2" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 2 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython2", 103 | "version": "2.7.12" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 0 108 | } 109 | -------------------------------------------------------------------------------- /Summ_Sumy_lsa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from sumy.summarizers import luhn\n", 13 | "from sumy.parsers.plaintext import PlaintextParser\n", 14 | "from sumy.nlp.tokenizers import Tokenizer\n", 15 | "from sumy.nlp.stemmers import Stemmer\n", 16 | "from sumy.utils import get_stop_words\n", 17 | "from sumy.summarizers.lsa import LsaSummarizer as Summarizer" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "LANGUAGE = \"english\"\n", 29 | "SENTENCES_COUNT = 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "for file in os.listdir('topics'):\n", 41 | " with open('topics/' + file, 'r') as f:\n", 42 | " parser = 
PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n", 43 | " stemmer = Stemmer(LANGUAGE)\n", 44 | " summarizer = Summarizer(stemmer)\n", 45 | " summarizer.stop_words = get_stop_words(LANGUAGE)\n", 46 | " with open('summ_sumy_lsa/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 47 | " for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 48 | " #print sentence\n", 49 | " fw.writelines(str(sentence))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "anaconda-cloud": {}, 64 | "kernelspec": { 65 | "display_name": "Python [default]", 66 | "language": "python", 67 | "name": "python2" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 2 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython2", 79 | "version": "2.7.12" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 0 84 | } 85 | -------------------------------------------------------------------------------- /Summ_Sumy_luhn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import os\n", 12 | "from sumy.summarizers.luhn import LuhnSummarizer \n", 13 | "from sumy.parsers.plaintext import PlaintextParser\n", 14 | "from sumy.nlp.tokenizers import Tokenizer\n", 15 | "from sumy.nlp.stemmers import Stemmer\n", 16 | "from sumy.utils import get_stop_words" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 5, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "LANGUAGE = \"english\"\n", 
28 | "SENTENCES_COUNT = 1" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 6, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "for file in os.listdir('topics'):\n", 40 | " with open('topics/' + file, 'r') as f:\n", 41 | " parser = PlaintextParser.from_string(unicode(f.read().replace('\\n',' '), errors='ignore'), Tokenizer(LANGUAGE))\n", 42 | " stemmer = Stemmer(LANGUAGE)\n", 43 | " summarizer = LuhnSummarizer(stemmer)\n", 44 | " summarizer.stop_words = get_stop_words(LANGUAGE)\n", 45 | " with open('summ_sumy_lex/' + file.split('.')[0] + '.txt', 'w') as fw:\n", 46 | " for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 47 | " #print sentence\n", 48 | " fw.writelines(str(sentence))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "While summarizing a document, the summarizer invokes its `__call__()` method, which takes two arguments: a document and `SENTENCES_COUNT`. The second argument doesn't seem to have any effect on the summary: even with `SENTENCES_COUNT = 1`, the generated summaries contain more than one sentence." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "anaconda-cloud": {}, 70 | "kernelspec": { 71 | "display_name": "Python [default]", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 0 90 | } 91 | -------------------------------------------------------------------------------- /bleu_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import nltk\n", 12 | "import os\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "path2model = 'path to model summary'\n", 25 | "path2gold = 'path to reference summary'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "files = os.listdir(path2model)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "bleu_scores = []\n", 48 | "for i in files:\n", 49 | " reference = []\n", 50 | " gold_summ = os.listdir(path2gold+'/' + i.split('.txt')[0])\n", 51 | " with open(path2model + '/' + i,'r') as fmodel:\n", 52 | " textmodel = fmodel.read()\n", 53 | " 
fmodel.close()\n", 54 | " for j in gold_summ:\n", 55 | " with open(path2gold + '/' + i.split('.txt')[0] + '/' + j,'r') as fgold: \n", 56 | " textgold = fgold.read()\n", 57 | " fmodel.close()\n", 58 | " reference.append(textgold)\n", 59 | " \n", 60 | " \n", 61 | " bleu_scores.append(nltk.translate.bleu_score.sentence_bleu(reference, textmodel, weights=[0.4,0.3,0.2]))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "np.mean(bleu_scores)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "np.std(bleu_scores)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "anaconda-cloud": {}, 98 | "kernelspec": { 99 | "display_name": "Python [conda root]", 100 | "language": "python", 101 | "name": "conda-root-py" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 2 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython2", 113 | "version": "2.7.12" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 1 118 | } 119 | -------------------------------------------------------------------------------- /pyrougetextrank.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from pythonrouge import pythonrouge 4 | 5 | path2model = 'path to model summary' 6 | path2gold = 'path to reference summaries' 7 | files = os.listdir(path2model) 8 | final_mean = [] 9 | for i in files: 10 | gold_summ = os.listdir(path2gold+'/' + i.split('.txt')[0]) 11 | with 
open(path2model + '/' + i,'r') as fmodel: 12 | textmodel = fmodel.read() 13 | fmodel.close() 14 | score = [] 15 | for j in gold_summ: 16 | with open(path2gold + '/' + i.split('.txt')[0] + '/' + j,'r') as fgold: 17 | textgold = fgold.read() 18 | fmodel.close() 19 | score.append(pythonrouge.pythonrouge(textmodel, textgold)['ROUGE-1']) 20 | final_mean.append(np.max(score)) 21 | print np.max(score) 22 | 23 | print np.mean(final_mean) 24 | print np.std(final_mean) 25 | --------------------------------------------------------------------------------